Restructure: Move grid_* + vocab_* into packages (klausur-service)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
grid/ package (16 files):
  grid/build/ — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/ — api, helpers, columns, filters, headers, zones
vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/ — session_store, learn_bridge
26 backward-compat shims. Internal imports are relative. RAG untouched.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -31,6 +31,7 @@
|
|||||||
|
|
||||||
# Two indivisible route handlers (~230 LOC each) that cannot be split further
|
# Two indivisible route handlers (~230 LOC each) that cannot be split further
|
||||||
**/vocab_worksheet_compare_api.py | owner=klausur | reason=compare_ocr_methods (234 LOC) + analyze_grid (255 LOC), each a single cohesive handler | review=2026-10-01
|
**/vocab_worksheet_compare_api.py | owner=klausur | reason=compare_ocr_methods (234 LOC) + analyze_grid (255 LOC), each a single cohesive handler | review=2026-10-01
|
||||||
|
**/vocab/worksheet/compare_api.py | owner=klausur | reason=Same file moved to vocab/ package | review=2026-10-01
|
||||||
|
|
||||||
# TypeScript Data Catalogs (admin-lehrer/lib/sdk/)
|
# TypeScript Data Catalogs (admin-lehrer/lib/sdk/)
|
||||||
# Pure exported const arrays/objects with type definitions, no business logic.
|
# Pure exported const arrays/objects with type definitions, no business logic.
|
||||||
|
|||||||
@@ -0,0 +1,10 @@
|
|||||||
|
"""
|
||||||
|
Grid package — restructured from grid_* flat modules.
|
||||||
|
|
||||||
|
Backward-compatible re-exports: consumers can still use
|
||||||
|
``from grid_build_core import ...`` etc. via the shim files in backend/.
|
||||||
|
|
||||||
|
Sub-packages:
|
||||||
|
- grid.build — grid construction pipeline (_build_grid_core and phases)
|
||||||
|
- grid.editor — FastAPI endpoints, helper functions, column/zone logic
|
||||||
|
"""
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
"""
|
||||||
|
Grid Build sub-package — grid construction pipeline.
|
||||||
|
|
||||||
|
Modules:
|
||||||
|
- core — _build_grid_core() main entry point
|
||||||
|
- zones — image loading, graphic/box detection, zone-aware grid building
|
||||||
|
- cleanup — junk row removal, artifact cleanup, pipe dividers
|
||||||
|
- text_ops — color annotation, heading detection, IPA correction
|
||||||
|
- cell_ops — bullet removal, garbled cells, word-box reordering
|
||||||
|
- finalize — dictionary detection, spell checking, result assembly
|
||||||
|
"""
|
||||||
@@ -0,0 +1,305 @@
|
|||||||
|
"""
|
||||||
|
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
|
||||||
|
garbled cell cleanup, word-box reordering, and max_columns enforcement.
|
||||||
|
|
||||||
|
Extracted from grid_build_core.py for maintainability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Tuple
|
||||||
|
|
||||||
|
from cv_ocr_engines import (
|
||||||
|
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove bullet/artifact word_boxes from cells, in place (Step 5i).

    Only cells holding at least two word_boxes are examined. Per cell:

    - Rule (a): drop non-black boxes with area < 200 and conf < 85.
    - Rule (a2): drop boxes whose whole text is one of a fixed set of
      stray symbols.
    - Rule (b)/(c): order the boxes by ``left`` and compare each box with
      its right-hand neighbour. Horizontally overlapping pairs are either
      queued for merging (syllable-split words) or reduced to one box;
      near-adjacent blue boxes with identical text are de-duplicated.

    After filtering, ``cell["text"]`` is rebuilt from the remaining boxes
    unless the cell carries a truthy ``_ipa_corrected`` flag. When the net
    removal count is non-zero, cells left with neither word_boxes nor text
    are dropped from every zone and the count is logged.

    Args:
        zones_data: Zone dicts; their ``"cells"`` lists and the cells'
            ``"word_boxes"``/``"text"`` entries are modified in place.
    """
    # Letters (ASCII plus U+00C0..U+024F) and hyphens, optionally followed
    # by trailing punctuation.
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    # Symbols that rule (a2) deletes when they make up a whole word_box.
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}

    # Net counter: every merge subtracts one, every removed index adds one,
    # so boxes that only vanished through a merge are not counted.
    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            # Indices into ``wbs`` that will be filtered out at the end.
            to_remove: set = set()

            # Rule (a): tiny coloured symbols
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)

            # Rule (a2): isolated non-alphanumeric symbols
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)

            # Rule (b) + (c): overlap and duplicate detection
            # Pairs of original indices (left box, right box) to merge later.
            to_merge: List[Tuple[int, int]] = []
            # Keep the original index with each box while ordering by x.
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                # Overlap as a fraction of the narrower box's width.
                overlap_pct = overlap / min_w if min_w > 0 else 0

                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()

                    # Syllable-split words
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue

                    # High overlap with short prefix
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue

                    # Overlaps of at most 40% that were not merged above are
                    # left untouched.
                    if overlap_pct <= 0.40:
                        continue

                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)

                    # Very high overlap: prefer IPA-dictionary word
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue

                    # Otherwise drop the lower-confidence box; on a tie drop
                    # the taller one (the right-hand box if heights are equal).
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)

                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    # Duplicate blue boxes side by side: keep the more
                    # confident one (the right-hand one on a tie).
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)

            # Execute merges first (syllable-split words)
            if to_merge:
                # Maps an absorbed box index to the index it was merged into,
                # so chained merges (a+b, then b+c) land in the same box.
                merge_parent: Dict[int, int] = {}
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    # Never merge into or from a box already marked for removal.
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue
                    if mi2 in merge_parent:
                        continue
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    # Trailing punctuation of the left part is dropped before
                    # the two texts are concatenated.
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    # The merged box is the bounding box of both parts.
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # Compensates for the ``len(to_remove)`` added below.
                    bullet_removed -= 1

            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                # IPA-corrected cells keep their existing text.
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)

    if bullet_removed:
        # Drop cells that now have neither word_boxes nor text.
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Drop cells whose text is pure OCR noise (Step 5j-pre).

    A cell counts as noise when its text, stripped of whitespace and
    trailing punctuation, is empty, consists only of non-word characters
    and digits (a few connector symbols excepted), is a very short
    non-word, is a short all-caps token, or is a short letter/digit mix
    that is not a page reference such as ``p.12``.

    If anything was removed, every zone's ``"rows"`` list is reduced to the
    rows that still own at least one cell.

    Args:
        zones_data: Zone dicts, modified in place.
    """
    known_short_words = {
        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
        "die", "der", "das", "dem", "den", "des", "ein", "und",
        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
        "on", "or", "so", "to", "up", "us", "we",
        "the", "and", "but", "for", "not",
    }
    junk_only = re.compile(r'^[\W\d\s]+$')
    meaningful_symbols = ('=', '(=', '=)', ';', ':', '-', '–', '—')

    def looks_garbled(cell: Dict[str, Any]) -> bool:
        """Return True when the cell's text matches one of the noise rules."""
        stem = (cell.get("text") or "").strip().rstrip(".,;:!?'\"")
        if not stem:
            return True
        if junk_only.match(stem):
            # Symbol-only text is noise unless it is a known connector.
            return stem.strip() not in meaningful_symbols
        unknown = stem.lower() not in known_short_words
        if len(stem) <= 2 and unknown and not stem.isalpha():
            return True
        if len(stem) <= 3 and stem.isupper() and unknown:
            return True
        return bool(
            len(stem) <= 5
            and re.search(r'\d', stem)
            and re.search(r'[A-Za-z]', stem)
            and not re.match(r'^[pPsS]\.?\d+$', stem)
        )

    dropped = 0
    for zone in zones_data:
        current = zone.get("cells", [])
        survivors = [cell for cell in current if not looks_garbled(cell)]
        dropped += len(current) - len(survivors)
        zone["cells"] = survivors

    if not dropped:
        return

    # Rows that lost all of their cells disappear as well.
    for zone in zones_data:
        live_rows = {cell.get("row_index") for cell in zone.get("cells", [])}
        zone["rows"] = [row for row in zone.get("rows", []) if row["index"] in live_rows]
    logger.info("Step 5j-pre: removed %d artifact cells", dropped)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
    """Put each cell's word_boxes into reading order (Step 5j).

    Cells with fewer than two word_boxes are skipped. For the others the
    boxes are grouped into lines by ``_group_words_into_lines`` (15 px
    vertical tolerance) and flattened line by line; the cell's list is
    replaced only when the resulting sequence differs from the current one.

    Args:
        zones_data: Zone dicts; ``cell["word_boxes"]`` is replaced in place.
    """
    changed_cells = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            boxes = cell.get("word_boxes") or []
            if len(boxes) < 2:
                continue

            ordered = []
            for line in _group_words_into_lines(boxes, y_tolerance_px=15):
                ordered.extend(line)

            # Compare by object identity, position by position.
            unchanged = len(ordered) == len(boxes) and all(
                new is old for new, old in zip(ordered, boxes)
            )
            if unchanged:
                continue
            cell["word_boxes"] = ordered
            changed_cells += 1

    if changed_cells:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", changed_cells)
|
||||||
|
|
||||||
|
|
||||||
|
def _enforce_max_columns(
    zones_data: List[Dict[str, Any]],
    max_columns: int,
) -> None:
    """Enforce max_columns by merging narrowest columns (Step 5k).

    Only zones with ``zone_type == "content"`` that have more than
    ``max_columns`` columns are touched. Columns are ranked once by pixel
    width; the narrowest is folded into its right-hand neighbour (or the
    left-hand one when it is the last column) until the limit is met.
    Cells of a folded column move to the target column; if the target
    already has a cell in the same row, text and word_boxes are appended
    to that cell (after the target's own content, whichever side the
    folded column was on). Finally columns are re-indexed ``0..N-1`` from
    left to right and the cells' ``col_index`` values are remapped.

    Args:
        zones_data: Zone dicts; ``"columns"`` and ``"cells"`` are modified
            in place.
        max_columns: Maximum number of columns allowed per content zone.
    """
    def _x_start(col: Dict[str, Any]) -> Any:
        # Same left-edge key the original used for ordering columns.
        return col.get("x_min_px", col.get("x_min_pct", 0))

    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cols = z.get("columns", [])
        cells = z.get("cells", [])
        if len(cols) <= max_columns:
            continue

        logger.info(
            "max_columns=%d: zone %s has %d columns -> merging",
            max_columns, z.get("zone_index"), len(cols),
        )

        # Ranked once up front; widths that grow through merging do not
        # change a column's position in this queue.
        cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))

        while len(cols) > max_columns and cols_by_width:
            narrowest = cols_by_width.pop(0)
            ni = narrowest["index"]

            sorted_by_x = sorted(cols, key=_x_start)
            pos = next(i for i, c in enumerate(sorted_by_x) if c is narrowest)
            if pos + 1 < len(sorted_by_x):
                merge_target = sorted_by_x[pos + 1]
            elif pos > 0:
                merge_target = sorted_by_x[pos - 1]
            else:
                # Single column left: nothing to merge into.
                break

            ti = merge_target["index"]

            # Grow the target's horizontal extent to cover both columns.
            merge_target["x_min_px"] = min(
                merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
                narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
            )
            merge_target["x_max_px"] = max(
                merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
                narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
            )
            if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
                merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
                merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])

            for cell in cells:
                if cell.get("col_index") != ni:
                    continue
                cell["col_index"] = ti
                # A cell already absorbed in this pass must not become a
                # merge target, otherwise the appended text would be lost
                # when it is filtered out below.
                existing = next(
                    (c for c in cells
                     if c is not cell
                     and not c.get("_merged")
                     and c.get("col_index") == ti
                     and c.get("row_index") == cell.get("row_index")),
                    None,
                )
                if existing is not None:
                    existing["text"] = (
                        ((existing.get("text") or "") + " " + (cell.get("text") or "")).strip()
                    )
                    existing["word_boxes"] = (
                        (existing.get("word_boxes") or []) + (cell.get("word_boxes") or [])
                    )
                    cell["_merged"] = True

            z["cells"] = [c for c in cells if not c.get("_merged")]
            cells = z["cells"]
            cols.remove(narrowest)

        # Re-index columns 0..N-1. The old->new map is built before any
        # index is rewritten and applied in one pass, so a cell that has
        # just been renumbered can never be renumbered a second time.
        ordered = sorted(cols, key=_x_start)
        remap = {col["index"]: new_idx for new_idx, col in enumerate(ordered)}
        for new_idx, col in enumerate(ordered):
            col["index"] = new_idx
        for cell in cells:
            old_idx = cell.get("col_index")
            if old_idx in remap:
                cell["col_index"] = remap[old_idx]

        logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
|
||||||
@@ -0,0 +1,390 @@
|
|||||||
|
"""
|
||||||
|
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
|
||||||
|
divider removal, connector normalization, border strip detection, and
|
||||||
|
alphabet sidebar removal.
|
||||||
|
|
||||||
|
Extracted from grid_build_core.py for maintainability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
from cv_ocr_engines import _words_to_reading_order_text
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_PIPE_RE = re.compile(r"^\|+$")
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
) -> bool:
    """Run the Phase 3 cleanup passes over the zone data, in fixed order.

    Order: junk rows, artifact cells, oversized word_boxes, pipe dividers,
    connector columns, border strips, alphabet sidebars.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered;
            forwarded to the border-strip pass.
        session_id: Session identifier; not referenced in this function
            body.

    Returns:
        The flag returned by the border-strip pass.
    """
    in_place_passes = (
        _remove_junk_rows,
        _remove_artifact_cells,
        _remove_oversized_word_boxes,
        _remove_pipe_dividers,
        _normalize_connector_columns,
    )
    for cleanup_pass in in_place_passes:
        cleanup_pass(zones_data)

    flag_after_borders = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return flag_after_borders
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
    """Remove rows that contain nothing but OCR noise.

    A row is junk when one of these holds for its word_boxes:

    1. Every box has conf < 50 and text of at most 3 characters.
    2. Oversized stub: at most 3 boxes, at most 5 characters in total,
       the tallest box is more than 1.8x the zone's median box height,
       and no box looks like a page reference (``p.12``, ``S 3``).
    3. Scattered debris: no box has text longer than 2 characters.

    Rows without any word_boxes are never classified as junk (previously
    such a row made rule 3 raise ``ValueError`` from ``max()`` on an empty
    sequence).

    Args:
        zones_data: Zone dicts; ``"cells"`` and ``"rows"`` are replaced in
            place for zones that lose rows.
    """
    _JUNK_CONF_THRESHOLD = 50
    _JUNK_MAX_TEXT_LEN = 3

    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not cells or not rows:
            continue

        # Compute median word height across the zone for oversized detection
        # (28 is the fallback when no box reports a positive height).
        all_wb_heights = sorted(
            wb["height"]
            for cell in cells
            for wb in cell.get("word_boxes") or []
            if wb.get("height", 0) > 0
        )
        median_wb_h = all_wb_heights[len(all_wb_heights) // 2] if all_wb_heights else 28

        # Group word_boxes by row once instead of rescanning all cells per row.
        wbs_by_row: Dict[Any, List[Dict[str, Any]]] = {}
        for cell in cells:
            wbs_by_row.setdefault(cell.get("row_index"), []).extend(
                cell.get("word_boxes") or []
            )

        junk_row_indices = set()
        for row in rows:
            ri = row["index"]
            row_wbs = wbs_by_row.get(ri)
            if not row_wbs:
                # No cells, or only cells without word_boxes: nothing to judge.
                continue

            texts = [(wb.get("text") or "").strip() for wb in row_wbs]

            # Rule 1: ALL word_boxes are low-conf AND short text
            if all(
                wb.get("conf", 0) < _JUNK_CONF_THRESHOLD and len(text) <= _JUNK_MAX_TEXT_LEN
                for wb, text in zip(row_wbs, texts)
            ):
                junk_row_indices.add(ri)
                continue

            # Rule 2: oversized stub -- <=3 words, short total text,
            # and word height > 1.8x median
            if len(row_wbs) <= 3:
                total_text = "".join(texts)
                max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
                has_page_ref = any(
                    re.match(r'^[pPsS]\.?\s*\d+$', text) for text in texts
                )
                if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
                    junk_row_indices.add(ri)
                    continue

            # Rule 3: scattered debris -- rows with only tiny fragments
            if max(len(text) for text in texts) <= 2:
                junk_row_indices.add(ri)

        if junk_row_indices:
            z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
            z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
            logger.info(
                "build-grid: removed %d junk rows from zone %d: %s",
                len(junk_row_indices), z.get("zone_index", 0),
                sorted(junk_row_indices),
            )
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove individual cells with a single very-short, low-conf word.

    A cell is an artifact when it has exactly one word_box whose stripped
    text is at most 2 characters long and whose ``conf`` is below 65
    (a missing ``conf`` counts as 100).

    Cells are selected individually rather than by ``cell_id``, so a
    missing or duplicated ``cell_id`` cannot cause unrelated cells to be
    removed along with an artifact.

    Args:
        zones_data: Zone dicts; ``"cells"`` is replaced in place for zones
            that lose cells.
    """
    _ARTIFACT_MAX_LEN = 2
    _ARTIFACT_CONF_THRESHOLD = 65

    def _is_artifact(cell: Dict[str, Any]) -> bool:
        wbs = cell.get("word_boxes") or []
        if len(wbs) != 1:
            return False
        text = (wbs[0].get("text") or "").strip()
        return len(text) <= _ARTIFACT_MAX_LEN and wbs[0].get("conf", 100) < _ARTIFACT_CONF_THRESHOLD

    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue

        kept: List[Dict[str, Any]] = []
        removed_texts: List[Any] = []
        for cell in cells:
            if _is_artifact(cell):
                removed_texts.append(cell.get("text"))
            else:
                kept.append(cell)

        if removed_texts:
            z["cells"] = kept
            logger.info(
                "build-grid: removed %d artifact cells from zone %d: %s",
                len(removed_texts), z.get("zone_index", 0),
                removed_texts,
            )
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
    """Remove word_boxes whose height is 3x+ the median (graphic artifacts).

    The median is taken over all word_boxes of the zone that report a
    positive height. Cells that lose boxes get their ``"text"`` rebuilt
    from the remaining ones. Afterwards cells with neither word_boxes nor
    text are dropped; the same keep-rule the pipe-divider and border-strip
    passes use, so text-only cells survive.

    Args:
        zones_data: Zone dicts, modified in place.
    """
    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        all_wh = sorted(
            wb["height"]
            for cell in cells
            for wb in cell.get("word_boxes") or []
            if wb.get("height", 0) > 0
        )
        if not all_wh:
            continue
        med_h = all_wh[len(all_wh) // 2]
        oversized_threshold = med_h * 3

        removed_oversized = 0
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
            if len(filtered) < len(wbs):
                removed_oversized += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)

        if removed_oversized:
            z["cells"] = [
                c for c in cells
                if (c.get("word_boxes") or (c.get("text") or "").strip())
            ]
            logger.info(
                "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
                removed_oversized, oversized_threshold, z.get("zone_index", 0),
            )
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
    """Remove pipe-character word_boxes (column divider artifacts).

    Pass 1 (skipped for zones with a ``vsplit_group``): word_boxes whose
    text consists solely of ``|`` characters are removed, the cell text is
    rebuilt, and cells left with neither word_boxes nor text are dropped.

    Pass 2 (all zones): pipes at the edges of ``cell["text"]`` are
    stripped, including edge pipes separated by whitespace such as
    ``"| |word|"``. Pipes inside a word are preserved. A ``None`` text is
    treated as empty instead of raising.

    Args:
        zones_data: Zone dicts, modified in place.
    """
    for z in zones_data:
        if z.get("vsplit_group") is not None:
            continue  # pipes already removed before split
        removed_pipes = 0
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
            if len(filtered) < len(wbs):
                removed_pipes += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        if removed_pipes:
            z["cells"] = [
                c for c in z.get("cells", [])
                if (c.get("word_boxes") or (c.get("text") or "").strip())
            ]
            logger.info(
                "build-grid: removed %d pipe-divider word_boxes from zone %d",
                removed_pipes, z.get("zone_index", 0),
            )

    # Strip pipe chars ONLY from cell edges (OCR artifacts).
    # Preserve pipes embedded in words as syllable separators.
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text") or ""
            if "|" not in text:
                continue
            trimmed = text.strip()
            cleaned = trimmed
            # Repeat so alternating whitespace/pipe runs at an edge are
            # fully removed; each round strictly shortens the string.
            while cleaned.startswith("|") or cleaned.endswith("|"):
                cleaned = cleaned.strip("|").strip()
            if cleaned != trimmed:
                cell["text"] = cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
    """Repair connector columns in which OCR appended stray characters.

    For each column with at least three cells, the most frequent non-empty
    text is determined. If that text is at most 10 characters long and
    covers at least 60% of the column's cells, every cell whose text starts
    with it and is at most two characters longer is reset to the dominant
    text (and so is its word_box, when the cell has exactly one).

    Args:
        zones_data: Zone dicts; cell texts are modified in place.
    """
    for zone in zones_data:
        columns = zone.get("columns", [])
        all_cells = zone.get("cells", [])
        if not (columns and all_cells):
            continue

        for column in columns:
            ci = column.get("index")
            members = [cell for cell in all_cells if cell.get("col_index") == ci]
            if len(members) < 3:
                continue

            # Pair every cell with its stripped text once.
            labelled = [(cell, (cell.get("text") or "").strip()) for cell in members]

            tally: Dict[str, int] = {}
            for _, label in labelled:
                if label:
                    tally[label] = tally.get(label, 0) + 1
            if not tally:
                continue

            dominant_text = max(tally, key=tally.get)  # type: ignore[arg-type]
            too_long = len(dominant_text) > 10
            too_rare = tally[dominant_text] < len(members) * 0.6
            if too_long or too_rare:
                continue

            longest_allowed = len(dominant_text) + 2
            fixed = 0
            for cell, label in labelled:
                is_outlier = (
                    label != dominant_text
                    and label.startswith(dominant_text)
                    and len(label) <= longest_allowed
                )
                if not is_outlier:
                    continue
                cell["text"] = dominant_text
                boxes = cell.get("word_boxes") or []
                if len(boxes) == 1:
                    boxes[0]["text"] = dominant_text
                fixed += 1

            if fixed:
                logger.info(
                    "build-grid: normalized %d outlier cells in connector column %d "
                    "(dominant='%s') zone %d",
                    fixed, ci, dominant_text, zone.get("zone_index", 0),
                )
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_border_strips(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
) -> bool:
    """Detect and remove page-border decoration strips (Step 4e).

    For each zone with at least 10 word_boxes, the boxes are ordered by
    their left edge and scanned from the left and from the right for the
    first horizontal gap wider than 30 px. If the boxes outside such a gap
    make up less than 20% of the zone's boxes they are treated as a border
    strip and removed (the left side is preferred when both qualify).
    Affected cells get their text rebuilt, and cells left with neither
    word_boxes nor text are dropped.

    Args:
        zones_data: Zone dicts, modified in place.
        border_prefiltered: When true the step is skipped entirely.

    Returns:
        ``border_prefiltered``, unchanged.
    """
    if border_prefiltered:
        logger.info("Step 4e: skipped (border pre-filter already applied)")
        return border_prefiltered

    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        # (left edge, word_box) pairs ordered left to right.
        placed = [
            (wb.get("left", 0), wb)
            for cell in cells
            for wb in cell.get("word_boxes") or []
        ]
        if len(placed) < 10:
            continue
        placed.sort(key=lambda t: t[0])
        total = len(placed)

        # -- Left-edge scan --
        left_strip_count = 0
        left_gap = 0
        running_right = 0
        for gi in range(total - 1):
            running_right = max(
                running_right,
                placed[gi][0] + placed[gi][1].get("width", 0),
            )
            gap = placed[gi + 1][0] - running_right
            if gap > 30:
                left_strip_count = gi + 1
                left_gap = gap
                break

        # -- Right-edge scan --
        right_strip_count = 0
        right_gap = 0
        running_left = placed[-1][0]
        for gi in range(total - 1, 0, -1):
            running_left = min(running_left, placed[gi][0])
            prev_right = placed[gi - 1][0] + placed[gi - 1][1].get("width", 0)
            gap = running_left - prev_right
            if gap > 30:
                right_strip_count = total - gi
                right_gap = gap
                break

        # Identity set of the word_boxes that form the strip.
        strip_wbs: set = set()
        strip_side = ""
        strip_gap = 0
        strip_count = 0
        if left_strip_count > 0 and left_strip_count / total < 0.20:
            strip_side = "left"
            strip_count = left_strip_count
            strip_gap = left_gap
            strip_wbs = {id(t[1]) for t in placed[:left_strip_count]}
        elif right_strip_count > 0 and right_strip_count / total < 0.20:
            strip_side = "right"
            strip_count = right_strip_count
            strip_gap = right_gap
            strip_wbs = {id(t[1]) for t in placed[total - right_strip_count:]}

        if not strip_wbs:
            continue

        # Counted per zone so the log line below reports this zone only,
        # not a running total across all zones processed so far.
        zone_removed = 0
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
            if len(filtered) < len(wbs):
                zone_removed += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        z["cells"] = [
            c for c in cells
            if (c.get("word_boxes") or (c.get("text") or "").strip())
        ]
        logger.info(
            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
            "(gap=%dpx, strip=%d/%d wbs)",
            zone_removed, strip_side, z.get("zone_index", 0),
            strip_gap, strip_count, total,
        )

    return border_prefiltered
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
    """Remove decorative edge columns (alphabet sidebar safety net).

    Dictionary pages have A-Z letter sidebars that OCR reads as single-
    character word_boxes. For zones with at least three columns, cells are
    grouped by their ``column_*`` ``col_type``; the first and last group
    are inspected, and the first one that has at least three cells, an
    average text length of at most 1.5 and at least 70% texts of length
    <= 1 is removed together with its column entry. At most one edge is
    removed per zone.

    Column types are ordered by their numeric suffix when they have one
    (``column_2`` before ``column_10``), so the right-most column is found
    correctly with ten or more columns; other names keep plain
    lexicographic order.

    Args:
        zones_data: Zone dicts; ``"cells"`` and ``"columns"`` are replaced
            in place for zones that lose an edge column.
    """
    prefix = "column_"

    def _column_order(col_type: str):
        suffix = col_type[len(prefix):]
        if suffix.isdecimal():
            return (0, int(suffix), col_type)
        return (1, 0, col_type)

    for z in zones_data:
        columns = z.get("columns", [])
        cells = z.get("cells", [])
        if len(columns) < 3 or not cells:
            continue

        col_cells: Dict[str, List[Dict]] = {}
        for cell in cells:
            ct = cell.get("col_type") or ""
            if ct.startswith(prefix):
                col_cells.setdefault(ct, []).append(cell)
        col_types_ordered = sorted(col_cells, key=_column_order)
        if len(col_types_ordered) < 3:
            continue

        for edge_ct in (col_types_ordered[0], col_types_ordered[-1]):
            edge_cells_list = col_cells[edge_ct]
            if len(edge_cells_list) < 3:
                continue
            texts = [(c.get("text") or "").strip() for c in edge_cells_list]
            avg_len = sum(len(t) for t in texts) / len(texts)
            single_ratio = sum(1 for t in texts if len(t) <= 1) / len(texts)
            if avg_len > 1.5 or single_ratio < 0.7:
                continue

            edge_ids = {id(c) for c in edge_cells_list}
            z["cells"] = [c for c in cells if id(c) not in edge_ids]
            z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
            logger.info(
                "Step 4f: removed decorative edge column '%s' from zone %d "
                "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
                edge_ct, z.get("zone_index", 0), len(edge_cells_list),
                avg_len, single_ratio * 100,
            )
            break  # only remove one edge per zone
|
||||||
@@ -0,0 +1,213 @@
|
|||||||
|
"""
Grid Build Core — the main _build_grid_core() function.

Extracted from grid_editor_api.py for maintainability.
Takes merged OCR word positions and builds a structured, zone-aware grid.

The function delegates to phase-specific sibling modules of this package:
  - zones.py    — image loading, graphic/box detection, zone grids
  - cleanup.py  — junk rows, artifacts, pipes, border strips
  - text_ops.py — color, headings, IPA, page refs
  - finalize.py — bullets, max_columns, dictionary, spelling, result
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from grid.editor.filters import (
|
||||||
|
_flatten_word_boxes,
|
||||||
|
_get_content_bounds,
|
||||||
|
_filter_decorative_margin,
|
||||||
|
_filter_footer_words,
|
||||||
|
_filter_header_junk,
|
||||||
|
)
|
||||||
|
|
||||||
|
from .zones import _build_zones
|
||||||
|
from .cleanup import _cleanup_zones
|
||||||
|
from .text_ops import _process_text
|
||||||
|
from .finalize import _finalize_grid
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def _build_grid_core(
|
||||||
|
session_id: str,
|
||||||
|
session: dict,
|
||||||
|
*,
|
||||||
|
ipa_mode: str = "auto",
|
||||||
|
syllable_mode: str = "auto",
|
||||||
|
enhance: bool = True,
|
||||||
|
max_columns: Optional[int] = None,
|
||||||
|
min_conf: Optional[int] = None,
|
||||||
|
) -> dict:
|
||||||
|
"""Core grid building logic — pure computation, no HTTP or DB side effects.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session_id: Session identifier (for logging and image loading).
|
||||||
|
session: Full session dict from get_session_db().
|
||||||
|
ipa_mode: "auto" (only when English headwords detected), "all"
|
||||||
|
(force IPA on all content columns), "en" (English column only),
|
||||||
|
"de" (German/definition columns only), or "none" (skip entirely).
|
||||||
|
syllable_mode: "auto" (only when original has pipe dividers),
|
||||||
|
"all" (force syllabification on all words), "en" (English only),
|
||||||
|
"de" (German only), or "none" (skip).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
StructuredGrid result dict.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If session data is incomplete.
|
||||||
|
"""
|
||||||
|
t0 = time.time()
|
||||||
|
|
||||||
|
# ── Phase 1: Input Validation & Word Filtering ──────────────────
|
||||||
|
|
||||||
|
# 1. Validate and load word results
|
||||||
|
word_result = session.get("word_result")
|
||||||
|
if not word_result or not word_result.get("cells"):
|
||||||
|
raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
|
||||||
|
|
||||||
|
img_w = word_result.get("image_width", 0)
|
||||||
|
img_h = word_result.get("image_height", 0)
|
||||||
|
if not img_w or not img_h:
|
||||||
|
raise ValueError("Missing image dimensions in word_result")
|
||||||
|
|
||||||
|
# 2. Flatten all word boxes from cells
|
||||||
|
all_words = _flatten_word_boxes(word_result["cells"])
|
||||||
|
if not all_words:
|
||||||
|
raise ValueError("No word boxes found in cells")
|
||||||
|
|
||||||
|
# 2a-pre. Apply min_conf filter if specified
|
||||||
|
if min_conf and min_conf > 0:
|
||||||
|
before = len(all_words)
|
||||||
|
all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
|
||||||
|
removed = before - len(all_words)
|
||||||
|
if removed:
|
||||||
|
logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
|
||||||
|
session_id, min_conf, removed, before)
|
||||||
|
|
||||||
|
logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
|
||||||
|
session_id, len(all_words), len(word_result["cells"]),
|
||||||
|
enhance, max_columns, min_conf)
|
||||||
|
|
||||||
|
# 2b. Filter decorative margin columns (alphabet graphics)
|
||||||
|
margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
|
||||||
|
margin_strip_detected = margin_strip_info.get("found", False)
|
||||||
|
|
||||||
|
# Read document_category from session
|
||||||
|
document_category = session.get("document_category")
|
||||||
|
|
||||||
|
# 2c. Filter footer rows (page numbers at the very bottom)
|
||||||
|
page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
|
||||||
|
|
||||||
|
# 2c2. Filter OCR junk from header illustrations
|
||||||
|
_filter_header_junk(all_words, img_h, logger, session_id)
|
||||||
|
|
||||||
|
# 2d. Filter words inside user-defined exclude regions
|
||||||
|
structure_result = session.get("structure_result")
|
||||||
|
exclude_rects = []
|
||||||
|
if structure_result:
|
||||||
|
for er in structure_result.get("exclude_regions", []):
|
||||||
|
exclude_rects.append({
|
||||||
|
"x": er["x"], "y": er["y"],
|
||||||
|
"w": er["w"], "h": er["h"],
|
||||||
|
})
|
||||||
|
if exclude_rects:
|
||||||
|
before = len(all_words)
|
||||||
|
filtered = []
|
||||||
|
for w in all_words:
|
||||||
|
w_cx = w["left"] + w.get("width", 0) / 2
|
||||||
|
w_cy = w["top"] + w.get("height", 0) / 2
|
||||||
|
inside = any(
|
||||||
|
er["x"] <= w_cx <= er["x"] + er["w"]
|
||||||
|
and er["y"] <= w_cy <= er["y"] + er["h"]
|
||||||
|
for er in exclude_rects
|
||||||
|
)
|
||||||
|
if not inside:
|
||||||
|
filtered.append(w)
|
||||||
|
removed = before - len(filtered)
|
||||||
|
if removed:
|
||||||
|
all_words = filtered
|
||||||
|
logger.info(
|
||||||
|
"build-grid session %s: removed %d words inside %d user exclude region(s)",
|
||||||
|
session_id, removed, len(exclude_rects),
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2e. Hard-filter words inside graphic/image regions from structure step
|
||||||
|
graphic_rects: List[Dict[str, int]] = []
|
||||||
|
if structure_result:
|
||||||
|
for g in structure_result.get("graphics", []):
|
||||||
|
graphic_rects.append({
|
||||||
|
"x": g["x"], "y": g["y"],
|
||||||
|
"w": g["w"], "h": g["h"],
|
||||||
|
})
|
||||||
|
if graphic_rects:
|
||||||
|
before = len(all_words)
|
||||||
|
all_words = [
|
||||||
|
w for w in all_words
|
||||||
|
if not any(
|
||||||
|
gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
|
||||||
|
and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
|
||||||
|
for gr in graphic_rects
|
||||||
|
)
|
||||||
|
]
|
||||||
|
removed = before - len(all_words)
|
||||||
|
if removed:
|
||||||
|
logger.info(
|
||||||
|
"build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
|
||||||
|
session_id, removed, len(graphic_rects),
|
||||||
|
)
|
||||||
|
|
||||||
|
content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
|
||||||
|
|
||||||
|
# ── Phase 2: Image Processing & Zone Detection ──────────────────
|
||||||
|
|
||||||
|
zone_result = await _build_zones(
|
||||||
|
session_id, session, all_words, graphic_rects,
|
||||||
|
content_x, content_y, content_w, content_h,
|
||||||
|
img_w, img_h,
|
||||||
|
)
|
||||||
|
zones_data = zone_result["zones_data"]
|
||||||
|
boxes_detected = zone_result["boxes_detected"]
|
||||||
|
recovered_count = zone_result["recovered_count"]
|
||||||
|
border_prefiltered = zone_result["border_prefiltered"]
|
||||||
|
img_bgr = zone_result["img_bgr"]
|
||||||
|
|
||||||
|
# ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────
|
||||||
|
|
||||||
|
border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id)
|
||||||
|
|
||||||
|
# ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────
|
||||||
|
|
||||||
|
text_result = _process_text(
|
||||||
|
zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Phase 5b+6: Finalize & Result Assembly ──────────────────────
|
||||||
|
|
||||||
|
duration = time.time() - t0
|
||||||
|
|
||||||
|
result = _finalize_grid(
|
||||||
|
zones_data=zones_data,
|
||||||
|
all_words=all_words,
|
||||||
|
img_bgr=img_bgr,
|
||||||
|
img_w=img_w,
|
||||||
|
img_h=img_h,
|
||||||
|
session_id=session_id,
|
||||||
|
max_columns=max_columns,
|
||||||
|
ipa_mode=ipa_mode,
|
||||||
|
syllable_mode=syllable_mode,
|
||||||
|
en_col_type=text_result["en_col_type"],
|
||||||
|
ipa_target_cols=text_result["ipa_target_cols"],
|
||||||
|
all_content_cols=text_result["all_content_cols"],
|
||||||
|
skip_ipa=text_result["skip_ipa"],
|
||||||
|
document_category=document_category,
|
||||||
|
margin_strip_detected=margin_strip_detected,
|
||||||
|
page_number_info=text_result["page_number_info"],
|
||||||
|
boxes_detected=boxes_detected,
|
||||||
|
recovered_count=recovered_count,
|
||||||
|
duration=duration,
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
@@ -0,0 +1,452 @@
|
|||||||
|
"""
|
||||||
|
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
|
||||||
|
dictionary detection, syllable dividers, spell checking, empty column
|
||||||
|
removal, and result assembly.
|
||||||
|
|
||||||
|
Extracted from grid_build_core.py for maintainability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from .cell_ops import (
|
||||||
|
_remove_bullets_and_artifacts,
|
||||||
|
_remove_garbled_cells,
|
||||||
|
_normalize_word_order,
|
||||||
|
_enforce_max_columns,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Apply the last cell-level passes and build the result dict.

    Runs, in order: bullet/artifact removal, garbled-cell removal, word
    order normalisation, optional ``max_columns`` enforcement, dictionary
    detection, word-gap merge, pipe auto-correction, syllable dividers,
    merged-word splitting, IPA bracket spacing, spell checking, empty
    column removal and removal of the internal ``_ipa_corrected`` flag.

    The word-gap merge and pipe auto-correction are best effort: any
    exception is logged as a warning and processing continues.

    Returns:
        The dict assembled by ``_assemble_result``.
    """
    from collections import Counter

    # Computed once, before the steps below run; the same value is handed
    # to the syllable and spell-check steps.
    total_cols = sum(len(zone.get("columns", [])) for zone in zones_data)

    # 5i. Blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)

    # 5j-pre. Cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)

    # 5j. word_box order -> reading order
    _normalize_word_order(zones_data)

    # 5k. Merge narrowest columns until max_columns is respected
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data, img_w, img_h, document_category, margin_strip_detected
    )

    # --- Word-gap merge (best effort) ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)

    # --- Pipe auto-correction (best effort) ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as e:
        logger.warning("Pipe autocorrect failed: %s", e)

    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data, img_bgr, session_id, syllable_mode, dict_detection,
        en_col_type, all_content_cols, total_cols,
    )

    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)

    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)

    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)

    # --- Log cell counts per column for content zones ---
    for zone in zones_data:
        if zone.get("zone_type") != "content":
            continue
        cells_per_col = Counter(c.get("col_index") for c in zone.get("cells", []))
        zone_cols = zone.get("columns", [])
        logger.info(
            "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
            zone.get("zone_index", 0), len(zone_cols), dict(sorted(cells_per_col.items())),
        )

    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)

    # Internal flags must not leak into the result
    for zone in zones_data:
        for cell in zone.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data, all_words, img_w, img_h, session_id,
        ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
        dict_detection, page_number_info, boxes_detected,
        recovered_count, duration, syllable_insertions,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_dictionary(
|
||||||
|
zones_data: List[Dict[str, Any]],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
document_category: Optional[str],
|
||||||
|
margin_strip_detected: bool,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Run dictionary detection on the assembled grid."""
|
||||||
|
from cv_layout import _score_dictionary_signals
|
||||||
|
dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
|
||||||
|
try:
|
||||||
|
from cv_vocab_types import ColumnGeometry
|
||||||
|
for z in zones_data:
|
||||||
|
zone_cells = z.get("cells", [])
|
||||||
|
zone_cols = z.get("columns", [])
|
||||||
|
if len(zone_cols) < 2 or len(zone_cells) < 10:
|
||||||
|
continue
|
||||||
|
pseudo_geoms = []
|
||||||
|
for col in zone_cols:
|
||||||
|
ci = col["index"]
|
||||||
|
col_cells = [c for c in zone_cells if c.get("col_index") == ci]
|
||||||
|
col_words = []
|
||||||
|
for cell in col_cells:
|
||||||
|
for wb in cell.get("word_boxes") or []:
|
||||||
|
col_words.append({
|
||||||
|
"text": wb.get("text", ""),
|
||||||
|
"conf": wb.get("conf", 0),
|
||||||
|
"top": wb.get("top", 0),
|
||||||
|
"left": wb.get("left", 0),
|
||||||
|
"height": wb.get("height", 0),
|
||||||
|
"width": wb.get("width", 0),
|
||||||
|
})
|
||||||
|
if not cell.get("word_boxes") and cell.get("text"):
|
||||||
|
col_words.append({
|
||||||
|
"text": cell["text"],
|
||||||
|
"conf": cell.get("confidence", 50),
|
||||||
|
"top": cell.get("bbox_px", {}).get("y", 0),
|
||||||
|
"left": cell.get("bbox_px", {}).get("x", 0),
|
||||||
|
"height": cell.get("bbox_px", {}).get("h", 20),
|
||||||
|
"width": cell.get("bbox_px", {}).get("w", 50),
|
||||||
|
})
|
||||||
|
col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
|
||||||
|
pseudo_geoms.append(ColumnGeometry(
|
||||||
|
index=ci, x=col.get("x_min_px", 0), y=0,
|
||||||
|
width=max(col_w, 1), height=img_h,
|
||||||
|
word_count=len(col_words), words=col_words,
|
||||||
|
width_ratio=col_w / max(img_w, 1),
|
||||||
|
))
|
||||||
|
if len(pseudo_geoms) >= 2:
|
||||||
|
dd = _score_dictionary_signals(
|
||||||
|
pseudo_geoms,
|
||||||
|
document_category=document_category,
|
||||||
|
margin_strip_detected=margin_strip_detected,
|
||||||
|
)
|
||||||
|
if dd["confidence"] > dict_detection["confidence"]:
|
||||||
|
dict_detection = dd
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Dictionary detection failed: %s", e)
|
||||||
|
return dict_detection
|
||||||
|
|
||||||
|
|
||||||
|
def _insert_syllable_dividers(
|
||||||
|
zones_data: List[Dict[str, Any]],
|
||||||
|
img_bgr: Any,
|
||||||
|
session_id: str,
|
||||||
|
syllable_mode: str,
|
||||||
|
dict_detection: Dict[str, Any],
|
||||||
|
en_col_type: Optional[str],
|
||||||
|
all_content_cols: set,
|
||||||
|
total_cols: int,
|
||||||
|
) -> int:
|
||||||
|
"""Insert syllable dividers for dictionary pages. Returns insertion count."""
|
||||||
|
syllable_insertions = 0
|
||||||
|
if syllable_mode == "none" or img_bgr is None:
|
||||||
|
if syllable_mode == "none":
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
t = cell.get("text", "")
|
||||||
|
if "|" in t:
|
||||||
|
cell["text"] = t.replace("|", "")
|
||||||
|
return syllable_insertions
|
||||||
|
|
||||||
|
_syllable_eligible = False
|
||||||
|
if syllable_mode in ("all", "de", "en"):
|
||||||
|
_syllable_eligible = True
|
||||||
|
elif (dict_detection.get("is_dictionary")
|
||||||
|
and dict_detection.get("article_col_index") is not None):
|
||||||
|
_syllable_eligible = True
|
||||||
|
|
||||||
|
_syllable_col_filter: Optional[set] = None
|
||||||
|
if syllable_mode == "en":
|
||||||
|
_syllable_col_filter = {en_col_type} if en_col_type else set()
|
||||||
|
elif syllable_mode == "de":
|
||||||
|
if en_col_type and total_cols >= 3:
|
||||||
|
_syllable_col_filter = all_content_cols - {en_col_type}
|
||||||
|
|
||||||
|
if _syllable_eligible:
|
||||||
|
try:
|
||||||
|
from cv_syllable_detect import insert_syllable_dividers
|
||||||
|
force_syllables = (syllable_mode in ("all", "de", "en"))
|
||||||
|
syllable_insertions = insert_syllable_dividers(
|
||||||
|
zones_data, img_bgr, session_id,
|
||||||
|
force=force_syllables,
|
||||||
|
col_filter=_syllable_col_filter,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Syllable insertion failed: %s", e)
|
||||||
|
|
||||||
|
return syllable_insertions
|
||||||
|
|
||||||
|
|
||||||
|
def _split_merged_words(
|
||||||
|
zones_data: List[Dict[str, Any]],
|
||||||
|
session_id: str,
|
||||||
|
) -> None:
|
||||||
|
"""Split merged words using dictionary lookup."""
|
||||||
|
try:
|
||||||
|
from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
|
||||||
|
if not _SPELL_AVAILABLE:
|
||||||
|
return
|
||||||
|
split_count = 0
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
text = cell.get("text", "")
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
parts = []
|
||||||
|
changed = False
|
||||||
|
for token in text.split():
|
||||||
|
clean = token
|
||||||
|
bracket_pos = clean.find('[')
|
||||||
|
suffix_ipa = ""
|
||||||
|
if bracket_pos > 0:
|
||||||
|
suffix_ipa = clean[bracket_pos:]
|
||||||
|
clean = clean[:bracket_pos]
|
||||||
|
suffix_punct = ""
|
||||||
|
stripped = clean.rstrip(".,!?;:'\")")
|
||||||
|
if stripped != clean:
|
||||||
|
suffix_punct = clean[len(stripped):]
|
||||||
|
clean = stripped
|
||||||
|
suffix = suffix_punct + suffix_ipa
|
||||||
|
contraction = ""
|
||||||
|
if "'" in clean and clean.index("'") >= 2:
|
||||||
|
apos_pos = clean.index("'")
|
||||||
|
contraction = clean[apos_pos:]
|
||||||
|
clean = clean[:apos_pos]
|
||||||
|
suffix = contraction + suffix
|
||||||
|
if len(clean) >= 4 and clean.isalpha():
|
||||||
|
split = _try_split_merged_word(clean)
|
||||||
|
if split:
|
||||||
|
parts.append(split + suffix)
|
||||||
|
changed = True
|
||||||
|
continue
|
||||||
|
parts.append(token)
|
||||||
|
if changed:
|
||||||
|
cell["text"] = " ".join(parts)
|
||||||
|
split_count += 1
|
||||||
|
if split_count:
|
||||||
|
logger.info("build-grid session %s: split %d merged words", session_id, split_count)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
|
||||||
|
"""Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
|
||||||
|
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
text = cell.get("text", "")
|
||||||
|
if text and "[" in text:
|
||||||
|
fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
|
||||||
|
if fixed != text:
|
||||||
|
cell["text"] = fixed
|
||||||
|
|
||||||
|
|
||||||
|
def _run_spell_checker(
|
||||||
|
zones_data: List[Dict[str, Any]],
|
||||||
|
session_id: str,
|
||||||
|
en_col_type: Optional[str],
|
||||||
|
total_cols: int,
|
||||||
|
) -> None:
|
||||||
|
"""Run SmartSpellChecker on all cells."""
|
||||||
|
try:
|
||||||
|
from smart_spell import SmartSpellChecker
|
||||||
|
_ssc = SmartSpellChecker()
|
||||||
|
spell_fix_count = 0
|
||||||
|
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
text = cell.get("text", "")
|
||||||
|
if not text or not text.strip():
|
||||||
|
continue
|
||||||
|
ct = cell.get("col_type", "")
|
||||||
|
if not ct.startswith("column_"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if total_cols >= 3 and en_col_type:
|
||||||
|
lang = "en" if ct == en_col_type else "de"
|
||||||
|
elif total_cols <= 2:
|
||||||
|
lang = "auto"
|
||||||
|
else:
|
||||||
|
lang = "auto"
|
||||||
|
|
||||||
|
result = _ssc.correct_text(text, lang=lang)
|
||||||
|
if result.changed:
|
||||||
|
cell["text"] = result.corrected
|
||||||
|
spell_fix_count += 1
|
||||||
|
|
||||||
|
if spell_fix_count:
|
||||||
|
logger.info(
|
||||||
|
"build-grid session %s: SmartSpellChecker fixed %d cells",
|
||||||
|
session_id, spell_fix_count,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
logger.debug("SmartSpellChecker not available in build-grid")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("SmartSpellChecker error in build-grid: %s", e)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
|
||||||
|
"""Remove columns that have no cells assigned."""
|
||||||
|
for z in zones_data:
|
||||||
|
cells = z.get("cells", [])
|
||||||
|
used_col_indices = {c.get("col_index") for c in cells}
|
||||||
|
old_cols = z.get("columns", [])
|
||||||
|
new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
|
||||||
|
if len(new_cols) < len(old_cols):
|
||||||
|
old_to_new = {}
|
||||||
|
for new_i, col in enumerate(new_cols):
|
||||||
|
old_i = col.get("col_index", col.get("index", new_i))
|
||||||
|
old_to_new[old_i] = new_i
|
||||||
|
col["col_index"] = new_i
|
||||||
|
col["index"] = new_i
|
||||||
|
col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
|
||||||
|
for cell in cells:
|
||||||
|
old_ci = cell.get("col_index", 0)
|
||||||
|
cell["col_index"] = old_to_new.get(old_ci, old_ci)
|
||||||
|
cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
|
||||||
|
z["columns"] = new_cols
|
||||||
|
|
||||||
|
|
||||||
|
def _assemble_result(
|
||||||
|
zones_data: List[Dict[str, Any]],
|
||||||
|
all_words: List[Dict[str, Any]],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
session_id: str,
|
||||||
|
ipa_mode: str,
|
||||||
|
syllable_mode: str,
|
||||||
|
ipa_target_cols: set,
|
||||||
|
skip_ipa: bool,
|
||||||
|
dict_detection: Dict[str, Any],
|
||||||
|
page_number_info: Optional[Dict],
|
||||||
|
boxes_detected: int,
|
||||||
|
recovered_count: int,
|
||||||
|
duration: float,
|
||||||
|
syllable_insertions: int,
|
||||||
|
) -> dict:
|
||||||
|
"""Build the final result dict (Phase 6)."""
|
||||||
|
total_cells = sum(len(z.get("cells", [])) for z in zones_data)
|
||||||
|
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
|
||||||
|
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
|
||||||
|
|
||||||
|
# Collect color statistics
|
||||||
|
color_stats: Dict[str, int] = {}
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
for wb in cell.get("word_boxes", []):
|
||||||
|
cn = wb.get("color_name", "black")
|
||||||
|
color_stats[cn] = color_stats.get(cn, 0) + 1
|
||||||
|
|
||||||
|
# Compute layout metrics
|
||||||
|
all_content_row_heights: List[float] = []
|
||||||
|
for z in zones_data:
|
||||||
|
for row in z.get("rows", []):
|
||||||
|
if not row.get("is_header", False):
|
||||||
|
h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
|
||||||
|
if h > 0:
|
||||||
|
all_content_row_heights.append(h)
|
||||||
|
avg_row_height = (
|
||||||
|
sum(all_content_row_heights) / len(all_content_row_heights)
|
||||||
|
if all_content_row_heights else 30.0
|
||||||
|
)
|
||||||
|
font_size_suggestion = max(10, int(avg_row_height * 0.6))
|
||||||
|
|
||||||
|
return {
|
||||||
|
"session_id": session_id,
|
||||||
|
"image_width": img_w,
|
||||||
|
"image_height": img_h,
|
||||||
|
"zones": zones_data,
|
||||||
|
"boxes_detected": boxes_detected,
|
||||||
|
"summary": {
|
||||||
|
"total_zones": len(zones_data),
|
||||||
|
"total_columns": total_columns,
|
||||||
|
"total_rows": total_rows,
|
||||||
|
"total_cells": total_cells,
|
||||||
|
"total_words": len(all_words),
|
||||||
|
"recovered_colored": recovered_count,
|
||||||
|
"color_stats": color_stats,
|
||||||
|
},
|
||||||
|
"formatting": {
|
||||||
|
"bold_columns": [],
|
||||||
|
"header_rows": [],
|
||||||
|
},
|
||||||
|
"layout_metrics": {
|
||||||
|
"page_width_px": img_w,
|
||||||
|
"page_height_px": img_h,
|
||||||
|
"avg_row_height_px": round(avg_row_height, 1),
|
||||||
|
"font_size_suggestion_px": font_size_suggestion,
|
||||||
|
},
|
||||||
|
"dictionary_detection": {
|
||||||
|
"is_dictionary": dict_detection.get("is_dictionary", False),
|
||||||
|
"confidence": dict_detection.get("confidence", 0.0),
|
||||||
|
"signals": dict_detection.get("signals", {}),
|
||||||
|
"article_col_index": dict_detection.get("article_col_index"),
|
||||||
|
"headword_col_index": dict_detection.get("headword_col_index"),
|
||||||
|
},
|
||||||
|
"processing_modes": {
|
||||||
|
"ipa_mode": ipa_mode,
|
||||||
|
"syllable_mode": syllable_mode,
|
||||||
|
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
|
||||||
|
"syllables_applied": syllable_insertions > 0,
|
||||||
|
},
|
||||||
|
"page_number": page_number_info,
|
||||||
|
"duration_seconds": round(duration, 2),
|
||||||
|
}
|
||||||
@@ -0,0 +1,489 @@
|
|||||||
|
"""
|
||||||
|
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
|
||||||
|
parenthesis fix, IPA phonetic correction, page ref extraction, and
|
||||||
|
slash-IPA conversion.
|
||||||
|
|
||||||
|
Extracted from grid_build_core.py for maintainability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||||
|
|
||||||
|
from cv_color_detect import detect_word_colors
|
||||||
|
from cv_ocr_engines import (
|
||||||
|
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
|
||||||
|
_lookup_ipa,
|
||||||
|
)
|
||||||
|
from grid.editor.headers import (
|
||||||
|
_detect_heading_rows_by_color,
|
||||||
|
_detect_heading_rows_by_single_cell,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
) -> Dict[str, Any]:
    """Run color annotation, heading detection, IPA handling and page refs.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: Image used for color detection; the color step is skipped
            when it is None.
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode.  "none" strips bracketed parts from
            all ``column_*`` cells instead of correcting them.
        page_number_info: Page number metadata, passed to the page-ref
            step and returned unchanged by reference.

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    # 5. Color annotation on the word_boxes of all cells
    if img_bgr is not None:
        every_word_box: List[Dict] = [
            wb
            for zone in zones_data
            for cell in zone.get("cells", [])
            for wb in cell.get("word_boxes", [])
        ]
        detect_word_colors(img_bgr, every_word_box)

    # 5a. Heading detection by color + height
    color_headings = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if color_headings:
        logger.info("Detected %d heading rows by color+height", color_headings)

    # 5b. A closing parenthesis without an opening one gets one prepended
    for zone in zones_data:
        for cell in zone.get("cells", []):
            raw = cell.get("text", "")
            if ")" in raw and "(" not in raw:
                cell["text"] = "(" + raw

    # 5c. IPA phonetic correction
    all_cells = [cell for zone in zones_data for cell in zone.get("cells", [])]
    total_cols = sum(len(zone.get("columns", [])) for zone in zones_data)
    skip_ipa = (ipa_mode == "none")
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()

    if skip_ipa:
        # ipa_mode=none: strip ALL square brackets from ALL content columns
        bracket_group = re.compile(r'\s*\[[^\]]+\]')
        for cell in all_cells:
            if not cell.get("col_type", "").startswith("column_"):
                continue
            text = cell.get("text", "")
            if "[" not in text:
                continue
            without_brackets = bracket_group.sub("", text)
            if without_brackets != text:
                cell["text"] = without_brackets.strip()
                cell["_ipa_corrected"] = True
    elif total_cols >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            all_cells, total_cols, ipa_mode, zones_data
        )
    else:
        # Fewer than 3 columns: no correction, but finalize still needs
        # the set of content columns that carry text
        all_content_cols = {
            cell.get("col_type", "")
            for cell in all_cells
            if cell.get("col_type", "").startswith("column_")
            and (cell.get("text") or "").strip()
        }

    # 5e. Heading detection by single-cell rows
    single_cell_headings = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_cell_headings:
        logger.info("Detected %d heading rows by single-cell heuristic", single_cell_headings)

    # 5f. Drop a trailing bracket group from heading text (unless that
    #     would leave the heading empty)
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            heading_text = cell.get("text", "")
            shortened = re.sub(r'\s*\[[^\]]*\]\s*$', '', heading_text).strip()
            if shortened and shortened != heading_text:
                cell["text"] = shortened

    # 5g. Extract page_ref cells and footer rows
    _extract_page_refs_and_footers(zones_data, page_number_info)

    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)

    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _run_ipa_correction(
    all_cells: List[Dict],
    total_cols: int,
    ipa_mode: str,
    zones_data: List[Dict[str, Any]],
) -> Tuple[Optional[str], set, set]:
    """Run IPA correction on cells and repair IPA continuation cells.

    Steps, in order:
      1. Pick the English headword column: the ``column_*`` type with the
         most cells that contain ``[`` or garbled IPA. In ``"all"`` mode,
         fall back to the ``column_*`` type with the most non-empty cells.
      2. Derive English / German target columns from ``ipa_mode``.
      3. Strip bracketed IPA from columns that are not targets.
      4. Apply English IPA (``fix_cell_phonetics``) and German IPA
         (``insert_german_ipa``); every cell whose text changed is flagged
         with ``_ipa_corrected``.
      5. Step 5d: rewrite IPA continuation cells via
         ``fix_ipa_continuation_cell`` using the same-column cell of the
         previous row.

    Args:
        all_cells: Flat list of cell dicts; mutated in place.
        total_cols: Not read by this function; kept so the call signature
            stays unchanged.
        ipa_mode: Mode string; the values handled here are ``"auto"``,
            ``"en"``, ``"de"``, ``"all"`` and ``"none"``.
        zones_data: Zone dicts providing ``rows`` and ``cells`` for the
            continuation pass; cell texts are mutated in place.

    Returns:
        ``(en_col_type, ipa_target_cols, all_content_cols)``.
    """
    en_col_type: Optional[str] = None
    all_content_cols: set = set()

    # Detect English headword column via IPA signals
    col_ipa_count: Dict[str, int] = {}
    for cell in all_cells:
        ct = cell.get("col_type", "")
        if not ct.startswith("column_"):
            continue
        txt = cell.get("text", "") or ""
        if txt.strip():
            all_content_cols.add(ct)
        if '[' in txt or _text_has_garbled_ipa(txt):
            col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
    if col_ipa_count:
        en_col_type = max(col_ipa_count, key=col_ipa_count.get)
    elif ipa_mode == "all":
        # No IPA signal anywhere: fall back to the most populated column.
        col_cell_count: Dict[str, int] = {}
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
        if col_cell_count:
            en_col_type = max(col_cell_count, key=col_cell_count.get)

    # Decide which columns to process based on ipa_mode
    en_ipa_target_cols: set = set()
    de_ipa_target_cols: set = set()
    if ipa_mode in ("auto", "en"):
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
    elif ipa_mode == "de":
        de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
    elif ipa_mode == "all":
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
        de_ipa_target_cols = all_content_cols - en_ipa_target_cols

    # --- Strip IPA from columns NOT in the target set ---
    _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
    strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
    if strip_en_ipa or ipa_mode == "none":
        strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct not in strip_cols:
                continue
            # `or ""` keeps a cell whose text is None from raising on `in`.
            text = cell.get("text") or ""
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True

    # Snapshot of every cell text before any IPA insertion. It is taken
    # unconditionally because the change-detection loop after the German
    # pass reads it even when there is no English target column (e.g.
    # ipa_mode == "de"); binding it only inside the English branch left the
    # name unbound in that case.
    _pre_ipa = {id(c): c.get("text", "") for c in all_cells}

    # --- English IPA (Britfone + eng_to_ipa) ---
    if en_ipa_target_cols:
        # Temporarily retag the target cells as "column_en" for
        # fix_cell_phonetics; the original col_type is restored below.
        for cell in all_cells:
            ct = cell.get("col_type")
            if ct in en_ipa_target_cols:
                cell["_orig_col_type"] = ct
                cell["col_type"] = "column_en"
        fix_cell_phonetics(all_cells, pronunciation="british")
        for cell in all_cells:
            orig = cell.pop("_orig_col_type", None)
            if orig:
                cell["col_type"] = orig
            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
                cell["_ipa_corrected"] = True

    # --- German IPA (wiki-pronunciation-dict + epitran) ---
    if de_ipa_target_cols:
        from cv_ipa_german import insert_german_ipa
        insert_german_ipa(all_cells, de_ipa_target_cols)

    ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols

    # Mark cells whose text was changed by IPA correction
    for cell in all_cells:
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True

    # 5d. Fix IPA continuation cells
    skip_ipa = (ipa_mode == "none")
    _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    # Loop-invariant patterns, compiled once instead of once per cell.
    bracket_with_real_ipa_re = re.compile(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]')
    long_word_re = re.compile(r'[A-Za-zÄÖÜäöüß]{3,}')
    ipa_cont_fixed = 0
    for z in ([] if skip_ipa else zones_data):
        rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
        z_cells = z.get("cells", [])
        for idx, row in enumerate(rows_sorted):
            if idx == 0:
                # The first row has no previous row to take a headword from.
                continue
            ri = row["index"]
            row_cells = [c for c in z_cells if c.get("row_index") == ri]
            for cell in row_cells:
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue
                cell_text = (cell.get("text") or "").strip()
                if not cell_text:
                    # Empty cell text: fall back to the joined word-box texts.
                    wb_texts = [w.get("text", "")
                                for w in cell.get("word_boxes", [])]
                    cell_text = " ".join(wb_texts).strip()
                if not cell_text:
                    continue

                is_bracketed = (
                    cell_text.startswith('[') and cell_text.endswith(']')
                )

                if is_bracketed:
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    # Brackets that already hold real IPA symbols are left alone.
                    if bracket_with_real_ipa_re.search(cell_text):
                        continue
                else:
                    # Unbracketed text is only considered when it is the sole
                    # content cell of its row outside column_1.
                    content_cells_in_row = [
                        c for c in row_cells
                        if c.get("col_type", "").startswith("column_")
                        and c.get("col_type") != "column_1"
                    ]
                    if len(content_cells_in_row) != 1:
                        continue
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if any(c in _REAL_IPA_CHARS for c in cell_text):
                        continue
                    _words_in_text = long_word_re.findall(cell_text)
                    if len(_words_in_text) >= 3:
                        continue

                # Find headword in previous row, same column
                prev_ri = rows_sorted[idx - 1]["index"]
                prev_same_col = [
                    c for c in z_cells
                    if c.get("row_index") == prev_ri
                    and c.get("col_type") == ct
                ]
                if not prev_same_col:
                    continue
                prev_text = prev_same_col[0].get("text", "")
                fixed = fix_ipa_continuation_cell(
                    cell_text, prev_text, pronunciation="british",
                )
                if fixed != cell_text:
                    cell["text"] = fixed
                    ipa_cont_fixed += 1
                    logger.info(
                        "IPA continuation R%d %s: '%s' -> '%s'",
                        ri, ct, cell_text, fixed,
                    )
    if ipa_cont_fixed:
        logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)

    return en_col_type, ipa_target_cols, all_content_cols
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_page_refs_and_footers(
    zones_data: List[Dict[str, Any]],
    page_number_info: Optional[Dict],
) -> Optional[Dict]:
    """Extract page_ref cells and footer rows from content zones.

    For every zone whose ``zone_type`` is ``"content"``:
      * ``column_1`` cells matching the page-reference pattern are collected
        into ``zone["page_refs"]`` (the cells themselves stay in the grid).
      * The last non-header row is treated as a footer when it holds exactly
        one short, comma-free, non-IPA, non-heading cell.
      * A footer consisting of digits or written-out number words is a page
        number: its row and cells are removed from the zone.
      * Any other footer row is flagged ``is_footer``, its cells get
        ``col_type = "footer"``, and it is stored in ``zone["footer"]``.

    Args:
        zones_data: Zone dicts; modified in place.
        page_number_info: Page-number info already known to the caller, or
            ``None`` / an empty dict when nothing is known yet. An empty
            dict is filled in place when a page-number footer is found;
            a non-empty dict is never overwritten.

    Returns:
        The effective page-number info: the object passed in (possibly
        filled), a newly created dict when ``None`` was passed and a
        page-number footer was found, or ``None`` when still unknown.
        Callers that pass ``None`` must use the return value, because a
        rebound local cannot reach them.
    """
    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
    _NUMBER_WORDS = {
        "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
        "einhundert", "zweihundert", "dreihundert", "vierhundert",
        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
    }

    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not rows:
            continue

        # Extract column_1 cells that look like page references
        page_refs = []
        for cell in cells:
            if cell.get("col_type") != "column_1":
                continue
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            if not _PAGE_REF_RE.match(text):
                continue
            page_refs.append({
                "row_index": cell.get("row_index"),
                "text": text,
                "bbox_pct": cell.get("bbox_pct", {}),
            })

        # Detect footer: last non-header row if it has only 1 cell
        footer_rows = []
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if non_header_rows:
            last_ri = non_header_rows[-1]["index"]
            # Use the local `cells` so a zone without a "cells" key cannot
            # raise KeyError here.
            last_cells = [c for c in cells if c.get("row_index") == last_ri]
            if len(last_cells) == 1:
                text = (last_cells[0].get("text") or "").strip()
                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
                has_commas = ',' in text
                text_words = set(text.lower().split())
                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
                is_page_number = len(text) <= 20 or is_written_number
                if (text and not has_real_ipa and not has_commas
                        and is_page_number
                        and last_cells[0].get("col_type") != "heading"):
                    footer_rows.append({
                        "row_index": last_ri,
                        "text": text,
                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
                    })

        # Classify footer rows
        page_number_footers = []
        other_footers = []
        for fr in footer_rows:
            ft = fr["text"].strip()
            digits = "".join(c for c in ft if c.isdigit())
            if digits and re.match(r'^[\d\s.]+$', ft):
                page_number_footers.append(fr)
            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
                page_number_footers.append(fr)
            else:
                other_footers.append(fr)

        # Remove page-number footer rows from grid entirely
        if page_number_footers:
            pn_ris = {fr["row_index"] for fr in page_number_footers}
            cells = [c for c in cells if c.get("row_index") not in pn_ris]
            z["cells"] = cells
            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
            pn_text = page_number_footers[0]["text"].strip()
            pn_digits = "".join(c for c in pn_text if c.isdigit())
            if not page_number_info:
                detected = {
                    "text": pn_text,
                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
                }
                if pn_digits:
                    detected["number"] = int(pn_digits)
                if isinstance(page_number_info, dict):
                    # Fill the caller's (empty) dict in place: rebinding the
                    # local name would silently drop the detected number.
                    page_number_info.update(detected)
                else:
                    page_number_info = detected

        # Mark remaining footer rows
        if other_footers:
            footer_ris = {fr["row_index"] for fr in other_footers}
            for r in z["rows"]:
                if r["index"] in footer_ris:
                    r["is_footer"] = True
            for c in cells:
                if c.get("row_index") in footer_ris:
                    c["col_type"] = "footer"

        if page_refs or footer_rows:
            logger.info(
                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
                len(page_refs), len(footer_rows), len(page_number_footers),
                z.get("zone_index", 0),
            )

        if page_refs:
            z["page_refs"] = page_refs
        if other_footers:
            z["footer"] = other_footers

    return page_number_info
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_slash_ipa(
    zones_data: List[Dict[str, Any]],
    skip_ipa: bool,
    en_col_type: Optional[str],
) -> None:
    """Convert slash-delimited IPA to bracket notation.

    Dictionary-style pages print IPA between slashes: "tiger /'taiga/"

    Three forms are rewritten, in this order:
      1. ``headword /ipa/`` becomes ``headword [ipa]``; the dictionary IPA
         from ``_lookup_ipa`` is preferred over the OCR'd text when found.
      2. A ``/ipa/`` group directly after a closing ``]`` becomes `` [ipa]``.
      3. Only if nothing changed so far: a cell that starts with ``/ipa/``
         gets that leading group rewritten to ``[ipa]``.
    Slash groups containing whitespace, parentheses or commas are rejected
    and left untouched.

    Args:
        zones_data: Zone dicts; the ``text`` of their cells is updated in
            place.
        skip_ipa: When true the function does nothing.
        en_col_type: If set, only cells of this column type are processed;
            if falsy, every cell is considered.
    """
    if skip_ipa:
        return

    # Patterns and replacement callbacks are loop-invariant, so they are
    # built once here rather than once per cell.
    _SLASH_IPA_RE = re.compile(
        r'(\b[a-zA-Z]+[²³¹]?)\s*'  # headword (capture group 1)
        r"(/[^/]{2,}/)"  # /ipa/ (capture group 2), min 2 chars
    )
    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
    _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
    slash_ipa_fixed = 0

    def _replace_slash_ipa(m: re.Match) -> str:
        """Rewrite one ``headword /ipa/`` match, or return it unchanged."""
        nonlocal slash_ipa_fixed
        headword = m.group(1)
        ocr_ipa = m.group(2)
        inner_raw = ocr_ipa.strip("/").strip()
        if _SLASH_IPA_REJECT_RE.search(inner_raw):
            return m.group(0)
        # Homograph markers (superscripts / digits) are not part of the word
        # that is looked up.
        clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
        ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
        if ipa:
            slash_ipa_fixed += 1
            return f"{headword} [{ipa}]"
        # No dictionary entry: keep the OCR'd IPA minus a leading apostrophe.
        inner = inner_raw.lstrip("'").strip()
        if inner:
            slash_ipa_fixed += 1
            return f"{headword} [{inner}]"
        return m.group(0)

    def _replace_trailing_slash(m: re.Match) -> str:
        """Rewrite a ``/ipa/`` group that directly follows a ``]``."""
        nonlocal slash_ipa_fixed
        inner = m.group(1).strip("/").strip().lstrip("'").strip()
        if _SLASH_IPA_REJECT_RE.search(inner):
            return m.group(0)
        if inner:
            slash_ipa_fixed += 1
            return f" [{inner}]"
        return m.group(0)

    for z in zones_data:
        for cell in z.get("cells", []):
            if en_col_type and cell.get("col_type") != en_col_type:
                continue
            # `or ""` keeps a cell whose text is None from raising on `in`.
            text = cell.get("text") or ""
            if "/" not in text:
                continue

            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)

            if new_text == text:
                # Nothing matched so far: try a cell that starts with /ipa/.
                m = _STANDALONE_SLASH_IPA_RE.match(text)
                if m:
                    inner = m.group(1).strip()
                    if not _SLASH_IPA_REJECT_RE.search(inner):
                        inner = inner.lstrip("'").strip()
                        if inner:
                            new_text = "[" + inner + "]" + text[m.end():]
                            slash_ipa_fixed += 1

            if new_text != text:
                cell["text"] = new_text

    if slash_ipa_fixed:
        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
|
||||||
@@ -0,0 +1,464 @@
|
|||||||
|
"""
|
||||||
|
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
|
||||||
|
detection and zone-aware grid building.
|
||||||
|
|
||||||
|
Extracted from grid_build_core.py for maintainability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from cv_box_detect import detect_boxes, split_page_into_zones
|
||||||
|
from cv_graphic_detect import detect_graphic_elements
|
||||||
|
from cv_color_detect import recover_colored_text
|
||||||
|
from cv_vocab_types import PageZone
|
||||||
|
from ocr_pipeline_session_store import get_session_image
|
||||||
|
|
||||||
|
from grid.editor.filters import (
|
||||||
|
_filter_border_strip_words,
|
||||||
|
_filter_border_ghosts,
|
||||||
|
_words_in_zone,
|
||||||
|
)
|
||||||
|
from grid.editor.zones import (
|
||||||
|
_PIPE_RE_VSPLIT,
|
||||||
|
_detect_vertical_dividers,
|
||||||
|
_split_zone_at_vertical_dividers,
|
||||||
|
_merge_content_zones_across_boxes,
|
||||||
|
_build_zone_grid,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def _build_zones(
    session_id: str,
    session: dict,
    all_words: List[Dict[str, Any]],
    graphic_rects: List[Dict[str, int]],
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    img_w: int,
    img_h: int,
) -> Dict[str, Any]:
    """Load image, detect graphics/boxes, build zone-aware grids.

    The session image is looked up as "cropped", then "dewarped", then
    "original". When it decodes and bordered boxes are found, the page is
    split into zones and one grid is built per zone; otherwise a single
    fallback "content" zone covering the content area is built.

    ``all_words`` and ``graphic_rects`` are modified in place (words removed
    or appended, freshly detected graphic rects appended). ``session`` is
    not read by this function.

    Returns a dict with keys:
        zones_data, boxes_detected, recovered_count, border_prefiltered,
        img_bgr (the decoded image, or None when no image could be loaded).
    """
    zones_data: List[Dict[str, Any]] = []
    boxes_detected = 0
    recovered_count = 0
    border_prefiltered = False
    img_bgr = None

    # 3. Load image for box detection
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")

    if img_png:
        # Decode image for color detection + box detection
        arr = np.frombuffer(img_png, dtype=np.uint8)
        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)

        if img_bgr is not None:
            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
            # Only words with at least 3 characters are passed to the detector.
            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
            if fresh_graphics:
                fresh_rects = [
                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
                    for g in fresh_graphics
                ]
                graphic_rects.extend(fresh_rects)
                logger.info(
                    "build-grid session %s: detected %d graphic region(s) via CV",
                    session_id, len(fresh_graphics),
                )
                # Hard-filter words inside newly detected graphic regions
                # (a word is "inside" when its centre point lies in a rect).
                before = len(all_words)
                all_words[:] = [
                    w for w in all_words
                    if not any(
                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in fresh_rects
                    )
                ]
                removed = before - len(all_words)
                if removed:
                    logger.info(
                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
                        session_id, removed, len(fresh_rects),
                    )

            # --- Recover colored text that OCR missed (before grid building) ---
            recovered = recover_colored_text(img_bgr, all_words)
            if recovered and graphic_rects:
                # Filter recovered chars inside graphic regions
                recovered = [
                    r for r in recovered
                    if not any(
                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in graphic_rects
                    )
                ]
            if recovered:
                recovered_count = len(recovered)
                all_words.extend(recovered)
                logger.info(
                    "build-grid session %s: +%d recovered colored words",
                    session_id, recovered_count,
                )

            # Detect bordered boxes
            boxes = detect_boxes(
                img_bgr,
                content_x=content_x,
                content_w=content_w,
                content_y=content_y,
                content_h=content_h,
            )
            boxes_detected = len(boxes)

            if boxes:
                # Filter border ghost words before grid building
                all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
                if ghost_count:
                    all_words[:] = all_words_new
                    logger.info(
                        "build-grid session %s: removed %d border ghost words",
                        session_id, ghost_count,
                    )

                # Split page into zones
                page_zones = split_page_into_zones(
                    content_x, content_y, content_w, content_h, boxes
                )

                # Merge content zones separated by box zones
                page_zones = _merge_content_zones_across_boxes(
                    page_zones, content_x, content_w
                )

                # 3b. Detect vertical dividers and split content zones
                # NOTE(review): border_prefiltered_vd is not used afterwards.
                page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
                    page_zones, all_words
                )

                # --- First pass: build grids per zone independently ---
                zone_grids = _build_grids_per_zone(
                    page_zones, all_words, img_w, img_h
                )
                border_prefiltered = border_prefiltered or any(
                    zg.get("_border_prefiltered") for zg in zone_grids
                )

                # --- Second pass: merge column boundaries from all content zones ---
                _merge_content_zone_columns(
                    zone_grids, all_words, content_w, img_w, img_h, session_id
                )

                # --- Build zones_data from zone_grids ---
                for zg in zone_grids:
                    pz = zg["pz"]
                    grid = zg["grid"]
                    # _raw_columns is only needed by the second pass above;
                    # drop it before the grid is spread into the zone entry.
                    grid.pop("_raw_columns", None)

                    zone_entry: Dict[str, Any] = {
                        "zone_index": pz.index,
                        "zone_type": pz.zone_type,
                        "bbox_px": {
                            "x": pz.x, "y": pz.y,
                            "w": pz.width, "h": pz.height,
                        },
                        "bbox_pct": {
                            "x": round(pz.x / img_w * 100, 2) if img_w else 0,
                            "y": round(pz.y / img_h * 100, 2) if img_h else 0,
                            "w": round(pz.width / img_w * 100, 2) if img_w else 0,
                            "h": round(pz.height / img_h * 100, 2) if img_h else 0,
                        },
                        "border": None,
                        "word_count": len(zg["words"]),
                        **grid,
                    }

                    if pz.box:
                        zone_entry["border"] = {
                            "thickness": pz.box.border_thickness,
                            "confidence": pz.box.confidence,
                        }

                    if pz.image_overlays:
                        zone_entry["image_overlays"] = pz.image_overlays

                    if pz.layout_hint:
                        zone_entry["layout_hint"] = pz.layout_hint
                    if pz.vsplit_group is not None:
                        zone_entry["vsplit_group"] = pz.vsplit_group

                    zones_data.append(zone_entry)

    # 4. Fallback: no boxes detected -> single zone with all words
    # (also reached when no image could be loaded or decoded)
    if not zones_data:
        before = len(all_words)
        # Drop recovered words of at most 2 characters.
        filtered_words = [
            w for w in all_words
            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
        ]
        removed = before - len(filtered_words)
        if removed:
            logger.info(
                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
                session_id, removed,
            )
        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
        if bs_removed:
            border_prefiltered = True
            logger.info(
                "build-grid session %s: pre-filtered %d border-strip words",
                session_id, bs_removed,
            )
        grid = _build_zone_grid(
            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
        zones_data.append({
            "zone_index": 0,
            "zone_type": "content",
            "bbox_px": {
                "x": content_x, "y": content_y,
                "w": content_w, "h": content_h,
            },
            "bbox_pct": {
                "x": round(content_x / img_w * 100, 2) if img_w else 0,
                "y": round(content_y / img_h * 100, 2) if img_h else 0,
                "w": round(content_w / img_w * 100, 2) if img_w else 0,
                "h": round(content_h / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            # NOTE(review): counts all_words, not filtered_words.
            "word_count": len(all_words),
            **grid,
        })

    return {
        "zones_data": zones_data,
        "boxes_detected": boxes_detected,
        "recovered_count": recovered_count,
        "border_prefiltered": border_prefiltered,
        "img_bgr": img_bgr,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_and_split_vertical_dividers(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
) -> tuple:
    """Split content zones at detected vertical dividers.

    Non-content zones and content zones without dividers pass through
    unchanged. A content zone with dividers is replaced by its sub-zones,
    and the pipe words found in that zone are removed from ``all_words``
    (in place). All resulting zones are re-indexed from 0.

    Returns (expanded_zones, border_prefiltered_from_vsplit); the second
    element is always False.
    """
    result_zones: List = []
    next_group = 0

    for zone in page_zones:
        if zone.zone_type != "content":
            result_zones.append(zone)
            continue

        words_here = _words_in_zone(
            all_words, zone.y, zone.height, zone.x, zone.width
        )
        split_xs = _detect_vertical_dividers(
            words_here, zone.x, zone.width, zone.y, zone.height
        )
        if not split_xs:
            result_zones.append(zone)
            continue

        pieces = _split_zone_at_vertical_dividers(zone, split_xs, next_group)
        result_zones.extend(pieces)
        next_group += 1

        # Drop the pipe words so they do not show up inside the sub-zones.
        # Identity (id) is used because word dicts may compare equal.
        pipe_word_ids = {
            id(word)
            for word in words_here
            if _PIPE_RE_VSPLIT.match((word.get("text") or "").strip())
        }
        all_words[:] = [
            word for word in all_words if id(word) not in pipe_word_ids
        ]
        logger.info(
            "build-grid: vertical split zone %d at x=%s -> %d sub-zones",
            zone.index, [int(x) for x in split_xs], len(pieces),
        )

    # Re-number every zone by its position in the expanded list.
    for position, zone in enumerate(result_zones):
        zone.index = position

    return result_zones, False
|
||||||
|
|
||||||
|
|
||||||
|
def _build_grids_per_zone(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """First pass: build one grid per zone, independently of the others.

    Per zone, the words are selected with ``_words_in_zone`` and then
    filtered three times (recovered artifacts of at most 2 characters,
    words centred inside an image overlay, border-strip words) before
    ``_build_zone_grid`` is called.

    Returns one dict per zone with the keys ``pz``, ``words``, ``grid``
    and ``_border_prefiltered``.
    """

    def _is_recovered_artifact(word: Dict[str, Any]) -> bool:
        # Recovered word whose stripped text has at most 2 characters.
        return bool(word.get("recovered")) and len(word.get("text", "").strip()) <= 2

    def _center_in_overlay(word: Dict[str, Any], overlay: Dict[str, Any]) -> bool:
        # True when the word's centre point lies within the overlay rect.
        return (
            overlay["y"] <= word["top"] + word["height"] / 2 <= overlay["y"] + overlay["height"]
            and overlay["x"] <= word["left"] + word["width"] / 2 <= overlay["x"] + overlay["width"]
        )

    results: List[Dict] = []

    for zone in page_zones:
        words = _words_in_zone(
            all_words, zone.y, zone.height, zone.x, zone.width
        )
        if zone.zone_type == "content":
            logger.info(
                "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
                zone.index, zone.zone_type,
                zone.x, zone.x + zone.width, zone.y, zone.y + zone.height,
                len(words), len(all_words),
            )

        # Recovered single/double-character artifacts are dropped in ALL zones.
        count_before = len(words)
        words = [word for word in words if not _is_recovered_artifact(word)]
        dropped = count_before - len(words)
        if dropped:
            logger.info(
                "build-grid: filtered %d recovered artifacts from %s zone %d",
                dropped, zone.zone_type, zone.index,
            )

        # Words inside image overlay regions (merged box zones) are dropped.
        if zone.image_overlays:
            count_before_overlay = len(words)
            words = [
                word for word in words
                if not any(
                    _center_in_overlay(word, overlay)
                    for overlay in zone.image_overlays
                )
            ]
            dropped_overlay = count_before_overlay - len(words)
            if dropped_overlay:
                logger.info(
                    "build-grid: filtered %d words inside image overlays from zone %d",
                    dropped_overlay, zone.index,
                )

        words, strip_removed = _filter_border_strip_words(words)
        prefiltered = bool(strip_removed)
        if prefiltered:
            logger.info(
                "build-grid: pre-filtered %d border-strip words from zone %d",
                strip_removed, zone.index,
            )

        zone_grid = _build_zone_grid(
            words, zone.x, zone.y, zone.width, zone.height,
            zone.index, img_w, img_h,
            skip_first_row_header=bool(zone.image_overlays),
        )
        results.append({
            "pz": zone,
            "words": words,
            "grid": zone_grid,
            "_border_prefiltered": prefiltered,
        })

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_content_zone_columns(
    zone_grids: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    content_w: int,
    img_w: int,
    img_h: int,
    session_id: str,
) -> None:
    """Second pass: unify the column boundaries of all content zones.

    The split points (``x_min`` of every non-first raw column) of all
    content zones that are not part of a vertical split are pooled, nearby
    points are merged, and every content zone is rebuilt with the resulting
    global columns. Nothing happens when fewer than two such zones exist,
    when there are no split points, or when merging would yield fewer
    columns than the widest single zone already has.

    Modifies zone_grids in place.
    """
    mergeable = [
        entry for entry in zone_grids
        if entry["pz"].zone_type == "content"
        and entry["pz"].vsplit_group is None
    ]
    if len(mergeable) < 2:
        return

    # Pool the left edges of all non-first columns, in ascending order.
    split_points: List[float] = sorted(
        column["x_min"]
        for entry in mergeable
        for column in entry["grid"].get("_raw_columns", [])[1:]
    )
    if not split_points:
        return

    # Fold each point into the previous cluster when closer than the snap
    # distance; the cluster value becomes the average of the two.
    snap = max(25, int(content_w * 0.03))
    clustered: List[float] = []
    for point in split_points:
        if clustered and point - clustered[-1] < snap:
            clustered[-1] = (clustered[-1] + point) / 2
        else:
            clustered.append(point)

    total_cols = len(clustered) + 1
    max_zone_cols = max(
        len(entry["grid"].get("_raw_columns", []))
        for entry in mergeable
    )
    if total_cols < max_zone_cols:
        return

    # Column edges: leftmost word edge, the clustered splits, rightmost edge.
    left_edge = min(word["left"] for word in all_words)
    right_edge = max(word["left"] + word["width"] for word in all_words)
    edges = [left_edge, *clustered, right_edge]
    merged_columns: List[Dict[str, Any]] = [
        {
            "index": position,
            "type": f"column_{position + 1}",
            "x_min": lower,
            "x_max": upper,
        }
        for position, (lower, upper) in enumerate(zip(edges, edges[1:]))
    ]

    # Re-build ALL content zones (including vertical-split ones) with the
    # merged columns.
    for entry in zone_grids:
        zone = entry["pz"]
        if zone.zone_type != "content":
            continue
        entry["grid"] = _build_zone_grid(
            entry["words"], zone.x, zone.y,
            zone.width, zone.height,
            zone.index, img_w, img_h,
            global_columns=merged_columns,
            skip_first_row_header=bool(zone.image_overlays),
        )

    logger.info(
        "build-grid session %s: union of %d content "
        "zones -> %d merged columns (max single zone: %d)",
        session_id, len(mergeable),
        total_cols, max_zone_cols,
    )
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor sub-package — FastAPI endpoints and helper functions.
|
||||||
|
|
||||||
|
Modules:
|
||||||
|
- api — barrel re-export (combined router + _build_grid_core)
|
||||||
|
- api_grid — build-grid, save-grid, get-grid endpoints
|
||||||
|
- api_gutter — gutter-repair endpoints
|
||||||
|
- api_box — build-box-grids endpoints
|
||||||
|
- api_unified — build-unified-grid endpoints
|
||||||
|
- helpers — barrel re-export of all helper symbols
|
||||||
|
- columns — column detection, cross-column splitting
|
||||||
|
- filters — word/zone filtering, border ghosts
|
||||||
|
- headers — header/heading detection, colspan detection
|
||||||
|
- zones — vertical dividers, zone splitting/merging
|
||||||
|
"""
|
||||||
@@ -0,0 +1,31 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor API — barrel re-export.
|
||||||
|
|
||||||
|
The actual endpoints live in the sibling modules of this package:
- api_grid.py (build-grid, rerun-ocr-and-build-grid, save-grid, grid-editor)
- api_gutter.py (gutter-repair, gutter-repair/apply)
- api_box.py (build-box-grids)
- api_unified.py (build-unified-grid, unified-grid)
|
||||||
|
|
||||||
|
This module re-exports the combined router and key symbols so that
|
||||||
|
existing `from grid_editor_api import router` / `from grid_editor_api import _build_grid_core`
|
||||||
|
continue to work unchanged.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from .api_grid import router as _grid_router
|
||||||
|
from .api_gutter import router as _gutter_router
|
||||||
|
from .api_box import router as _box_router
|
||||||
|
from .api_unified import router as _unified_router
|
||||||
|
|
||||||
|
# Re-export _build_grid_core so callers that do
|
||||||
|
# `from grid_editor_api import _build_grid_core` keep working.
|
||||||
|
from grid.build.core import _build_grid_core # noqa: F401
|
||||||
|
|
||||||
|
# Combined router: every sub-module router is mounted on a single APIRouter so
# that importers only need this one `router` object.
router = APIRouter()
for _sub_router in (_grid_router, _gutter_router, _box_router, _unified_router):
    router.include_router(_sub_router)
del _sub_router  # keep the module namespace identical to the explicit form
|
||||||
@@ -0,0 +1,177 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor API — box-grid-review endpoints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, Request
|
||||||
|
|
||||||
|
from .filters import _words_in_zone
|
||||||
|
from ocr_pipeline_session_store import (
|
||||||
|
get_session_db,
|
||||||
|
update_session_db,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])


def _load_box_spell_checker():
    """Return a SmartSpellChecker instance, or None if smart_spell cannot be imported."""
    try:
        from smart_spell import SmartSpellChecker
        return SmartSpellChecker()
    except ImportError:
        # Spell correction is optional; without the module the cells stay as-is.
        return None


@router.post("/sessions/{session_id}/build-box-grids")
async def build_box_grids(session_id: str, request: Request):
    """Rebuild grid structure for all detected boxes with layout-aware detection.

    Uses structure_result.boxes (from Step 7) as the source of box coordinates,
    and raw_paddle_words (falling back to raw_tesseract_words) as OCR word
    source. Existing zones of type "box" are dropped and re-created inside the
    session's grid_editor_result, which is then persisted.

    Optional body: { "overrides": { "0": "bullet_list" } }
    Maps box_index -> forced layout_type. A missing, unparsable or non-object
    body is treated as "no overrides".

    Returns:
        Dict with session_id, box_zones_rebuilt, total_detected_boxes,
        spell_fixes and the full (y-sorted) zones list. When no boxes are
        detected a short dict with a "message" key is returned instead.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if there is no
            grid data or no raw OCR words.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    # Get raw OCR words (with top/left/width/height keys)
    word_result = session.get("word_result") or {}
    all_words = word_result.get("raw_paddle_words") or word_result.get("raw_tesseract_words") or []
    if not all_words:
        raise HTTPException(status_code=400, detail="No raw OCR words available.")

    # Get detected boxes from structure_result (ground_truth copy as fallback)
    structure_result = session.get("structure_result") or {}
    gt = session.get("ground_truth") or {}
    if not structure_result:
        structure_result = gt.get("structure_result") or {}
    detected_boxes = structure_result.get("boxes") or []
    if not detected_boxes:
        return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"}

    # Filter out false-positive boxes in header/footer margins.
    img_h_for_filter = grid_data.get("image_height", 0) or word_result.get("image_height", 0)
    if img_h_for_filter > 0:
        margin_frac = 0.07  # 7% of image height
        margin_top = img_h_for_filter * margin_frac
        margin_bottom = img_h_for_filter * (1 - margin_frac)
        filtered = []
        for box in detected_boxes:
            by = box.get("y", 0)
            bh = box.get("h", 0)
            box_center_y = by + bh / 2
            if box_center_y < margin_top or box_center_y > margin_bottom:
                logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)",
                            by, bh, box_center_y, margin_top, margin_bottom)
                continue
            filtered.append(box)
        detected_boxes = filtered

    # The body is optional: an empty or invalid payload simply means no overrides.
    try:
        body = await request.json()
    except Exception:
        body = {}
    # Valid JSON that is not an object (null, list, string) has no .get().
    if not isinstance(body, dict):
        body = {}
    layout_overrides = body.get("overrides") or {}
    if not isinstance(layout_overrides, dict):
        layout_overrides = {}

    from cv_box_layout import build_box_zone_grid

    img_w = grid_data.get("image_width", 0) or word_result.get("image_width", 0)
    img_h = grid_data.get("image_height", 0) or word_result.get("image_height", 0)

    zones = grid_data.get("zones", [])

    # Find highest existing zone_index.
    # NOTE(review): this is computed before the old box zones are removed, so
    # box zone indices keep growing on repeated rebuilds — confirm intended.
    max_zone_idx = max((z.get("zone_index", 0) for z in zones), default=-1)

    # Remove old box zones (we'll rebuild them)
    zones = [z for z in zones if z.get("zone_type") != "box"]

    box_count = 0
    spell_fixes = 0

    # The spell checker is created lazily (first box that has words) and then
    # reused, instead of being re-imported and rebuilt for every box.
    spell_checker = None
    spell_checker_loaded = False

    for box_idx, box in enumerate(detected_boxes):
        bx = box.get("x", 0)
        by = box.get("y", 0)
        bw = box.get("w", 0)
        bh = box.get("h", 0)

        if bw <= 0 or bh <= 0:
            continue

        # Filter raw OCR words inside this box
        zone_words = _words_in_zone(all_words, by, bh, bx, bw)
        if not zone_words:
            logger.info("Box %d: no words found in bbox (%d,%d,%d,%d)", box_idx, bx, by, bw, bh)
            continue

        zone_idx = max_zone_idx + 1 + box_idx
        forced_layout = layout_overrides.get(str(box_idx))

        # Build box grid
        box_grid = build_box_zone_grid(
            zone_words, bx, by, bw, bh,
            zone_idx, img_w, img_h,
            layout_type=forced_layout,
        )

        # Apply SmartSpellChecker to all box cells
        if not spell_checker_loaded:
            spell_checker = _load_box_spell_checker()
            spell_checker_loaded = True
        if spell_checker is not None:
            try:
                for cell in box_grid.get("cells", []):
                    text = cell.get("text", "")
                    if not text:
                        continue
                    result = spell_checker.correct_text(text, lang="auto")
                    if result.changed:
                        cell["text"] = result.corrected
                        spell_fixes += 1
            except ImportError:
                # Kept from the original best-effort handling: an ImportError
                # raised while correcting skips the remaining cells of this box.
                logger.debug("build-box-grids: spell correction skipped for box %d", box_idx)

        # Build zone entry
        zone_entry = {
            "zone_index": zone_idx,
            "zone_type": "box",
            "bbox_px": {"x": bx, "y": by, "w": bw, "h": bh},
            "bbox_pct": {
                "x": round(bx / img_w * 100, 2) if img_w else 0,
                "y": round(by / img_h * 100, 2) if img_h else 0,
                "w": round(bw / img_w * 100, 2) if img_w else 0,
                "h": round(bh / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            "word_count": len(zone_words),
            "columns": box_grid["columns"],
            "rows": box_grid["rows"],
            "cells": box_grid["cells"],
            "header_rows": box_grid.get("header_rows", []),
            "box_layout_type": box_grid.get("box_layout_type", "flowing"),
            "box_grid_reviewed": False,
            "box_bg_color": box.get("bg_color_name", ""),
            "box_bg_hex": box.get("bg_color_hex", ""),
        }
        zones.append(zone_entry)
        box_count += 1

    # Sort zones by y-position for correct reading order
    zones.sort(key=lambda z: z.get("bbox_px", {}).get("y", 0))

    grid_data["zones"] = zones
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "build-box-grids session %s: %d boxes processed (%d words spell-fixed) from %d detected",
        session_id, box_count, spell_fixes, len(detected_boxes),
    )

    return {
        "session_id": session_id,
        "box_zones_rebuilt": box_count,
        "total_detected_boxes": len(detected_boxes),
        "spell_fixes": spell_fixes,
        "zones": zones,
    }
|
||||||
@@ -0,0 +1,334 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor API — grid build, save, and retrieve endpoints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, Query, Request
|
||||||
|
|
||||||
|
from grid.build.core import _build_grid_core
|
||||||
|
from ocr_pipeline_session_store import (
|
||||||
|
get_session_db,
|
||||||
|
update_session_db,
|
||||||
|
)
|
||||||
|
from ocr_pipeline_common import (
|
||||||
|
_cache,
|
||||||
|
_load_session_to_cache,
|
||||||
|
_get_cached,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])


@router.post("/sessions/{session_id}/build-grid")
async def build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
):
    """Build a structured, zone-aware grid from existing Kombi word results.

    Requires that paddle-kombi or rapid-kombi has already been run on the session.
    Uses the image for box detection and the word positions for grid structuring.

    Query params:
        ipa_mode: "auto" (only when English IPA detected), "all" (force), "none" (skip).
            "de" and "en" are also accepted and passed through to the grid builder.
        syllable_mode: "auto" (only when original has dividers), "all" (force), "none" (skip).
            "de" and "en" are also accepted and passed through to the grid builder.
        enhance: forwarded to the grid builder unchanged.
        max_cols: maximum column count; 0 is forwarded as None (unlimited).
        min_conf: minimum OCR confidence; 0 is forwarded as None (auto).

    Returns a StructuredGrid with zones, each containing their own
    columns, rows, and cells — ready for the frontend Excel-like editor.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if the grid
            builder rejects the input with a ValueError.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    try:
        result = await _build_grid_core(
            session_id, session,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=max_cols if max_cols > 0 else None,
            min_conf=min_conf if min_conf > 0 else None,
        )
    except ValueError as e:
        # Chain the cause so the original traceback stays visible in logs.
        raise HTTPException(status_code=400, detail=str(e)) from e

    # Save automatic grid snapshot for later comparison with manual corrections
    # Lazy import to avoid circular dependency with ocr_pipeline_regression
    from ocr_pipeline_regression import _build_reference_snapshot

    # Map the stored OCR engine name onto the pipeline label of the snapshot.
    wr = session.get("word_result") or {}
    engine = wr.get("ocr_engine", "")
    if engine in ("kombi", "rapid_kombi"):
        auto_pipeline = "kombi"
    elif engine == "paddle_direct":
        auto_pipeline = "paddle-direct"
    else:
        auto_pipeline = "pipeline"
    auto_snapshot = _build_reference_snapshot(result, pipeline=auto_pipeline)

    gt = session.get("ground_truth") or {}
    gt["auto_grid_snapshot"] = auto_snapshot

    # Persist to DB and advance current_step to 11 (reconstruction complete)
    await update_session_db(session_id, grid_editor_result=result, ground_truth=gt, current_step=11)

    summary = result.get("summary", {})
    logger.info(
        "build-grid session %s: %d zones, %d cols, %d rows, %d cells, "
        "%d boxes in %.2fs",
        session_id,
        len(result.get("zones", [])),
        summary.get("total_columns", 0),
        summary.get("total_rows", 0),
        summary.get("total_cells", 0),
        result.get("boxes_detected", 0),
        result.get("duration_seconds", 0),
    )

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_tesseract_conf(raw) -> int:
    """Convert one Tesseract confidence value to int; -1 when it is not numeric.

    Accepts ints, floats and their string forms, so a float string such as
    "96.5" becomes 96 instead of being treated as unparsable.
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


def _word_boxes_for_storage(words):
    """Reduce OCR word dicts to the text/geometry/conf fields stored in word_result."""
    return [
        {
            "text": w["text"], "left": w["left"], "top": w["top"],
            "width": w["width"], "height": w["height"], "conf": w.get("conf", 0),
        }
        for w in words
    ]


@router.post("/sessions/{session_id}/rerun-ocr-and-build-grid")
async def rerun_ocr_and_build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
    vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
    doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
):
    """Re-run OCR with quality settings, then rebuild the grid.

    Unlike build-grid (which only rebuilds from existing words),
    this endpoint re-runs the full OCR pipeline on the cropped image
    with optional CLAHE enhancement, then builds the grid.

    Steps executed: Image Enhancement -> OCR -> Grid Build

    Returns:
        The grid produced by the grid builder, extended with "scan_quality"
        and "ocr_stats" entries. The new word_result and the grid are both
        persisted to the session.

    Raises:
        HTTPException: 404 if the session does not exist (initially or after
            the word_result update), 400 if no cropped/dewarped image is
            cached or the grid builder raises ValueError.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    import time as _time
    t0 = _time.time()

    # 1. Load the cropped/dewarped image from cache or session
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image, fall back to the dewarped one.
    dewarped_bgr = cached.get("cropped_bgr")
    if dewarped_bgr is None:
        dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        raise HTTPException(status_code=400, detail="No cropped/dewarped image available. Run preprocessing steps first.")

    img_h, img_w = dewarped_bgr.shape[:2]
    ocr_input = dewarped_bgr.copy()

    # 2. Scan quality assessment
    scan_quality_info = {}
    try:
        from scan_quality import score_scan_quality
        quality_report = score_scan_quality(ocr_input)
        scan_quality_info = quality_report.to_dict()
        actual_min_conf = min_conf if min_conf > 0 else quality_report.recommended_min_conf
    except Exception as e:
        # Quality scoring is best-effort; fall back to a fixed threshold of 40.
        logger.warning("rerun-ocr: scan quality failed: %s", e)
        actual_min_conf = min_conf if min_conf > 0 else 40

    # 3. Image enhancement (Step 3)
    is_degraded = scan_quality_info.get("is_degraded", False)
    if enhance and is_degraded:
        try:
            from ocr_image_enhance import enhance_for_ocr
            ocr_input = enhance_for_ocr(ocr_input, is_degraded=True)
            logger.info("rerun-ocr: CLAHE enhancement applied")
        except Exception as e:
            logger.warning("rerun-ocr: enhancement failed: %s", e)

    # 4. Run dual-engine OCR
    from PIL import Image
    import pytesseract

    # RapidOCR
    rapid_words = []
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(ocr_input, full_region) or []
    except Exception as e:
        logger.warning("rerun-ocr: RapidOCR failed: %s", e)

    # Tesseract (channel axis is reversed before handing the array to PIL)
    pil_img = Image.fromarray(ocr_input[:, :, ::-1])
    data = pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3', output_type=pytesseract.Output.DICT)
    tess_words = []
    for raw_text, raw_conf, left, top, width, height in zip(
        data["text"], data["conf"], data["left"], data["top"], data["width"], data["height"]
    ):
        text = (raw_text or "").strip()
        conf = _parse_tesseract_conf(raw_conf)
        # Drop empty entries and anything below the confidence threshold.
        if not text or conf < actual_min_conf:
            continue
        tess_words.append({
            "text": text, "left": left, "top": top,
            "width": width, "height": height, "conf": conf,
        })

    # 5. Merge OCR results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words

    # 6. Store updated word_result in session
    word_result = {
        "cells": [{"text": " ".join(w["text"] for w in merged_words),
                   "word_boxes": _word_boxes_for_storage(merged_words)}],
        "image_width": img_w,
        "image_height": img_h,
        "ocr_engine": "rapid_kombi",
        "word_count": len(merged_words),
        "raw_paddle_words": rapid_words,
    }

    # 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
    vision_applied = False
    if vision_fusion:
        try:
            from vision_ocr_fusion import vision_fuse_ocr
            category = doc_category or session.get("document_category") or "vokabelseite"
            logger.info("rerun-ocr: running Vision-LLM fusion (category=%s)", category)
            merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
            vision_applied = True
            # Rebuild storage from fused words
            word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words),
                                     "word_boxes": _word_boxes_for_storage(merged_words)}]
            word_result["word_count"] = len(merged_words)
            word_result["ocr_engine"] = "vision_fusion"
        except Exception as e:
            # Fusion is optional; keep the plain merged OCR result on failure.
            logger.warning("rerun-ocr: Vision-LLM fusion failed: %s", e)

    await update_session_db(session_id, word_result=word_result)

    # Reload session with updated word_result
    session = await get_session_db(session_id)
    if not session:
        # The session can vanish between the update and the reload.
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ocr_duration = _time.time() - t0
    logger.info(
        "rerun-ocr session %s: %d words (rapid=%d, tess=%d, merged=%d) in %.1fs "
        "(enhance=%s, min_conf=%d, quality=%s)",
        session_id, len(merged_words), len(rapid_words), len(tess_words),
        len(merged_words), ocr_duration, enhance, actual_min_conf,
        scan_quality_info.get("quality_pct", "?"),
    )

    # 7. Build grid from new words
    try:
        result = await _build_grid_core(
            session_id, session,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=max_cols if max_cols > 0 else None,
            min_conf=min_conf if min_conf > 0 else None,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e

    # Persist grid
    await update_session_db(session_id, grid_editor_result=result, current_step=11)

    # Add quality info to response
    result["scan_quality"] = scan_quality_info
    result["ocr_stats"] = {
        "rapid_words": len(rapid_words),
        "tess_words": len(tess_words),
        "merged_words": len(merged_words),
        "min_conf_used": actual_min_conf,
        "enhance_applied": enhance and is_degraded,
        "vision_fusion_applied": vision_applied,
        "document_category": doc_category or session.get("document_category", ""),
        "ocr_duration_seconds": round(ocr_duration, 1),
    }

    total_duration = _time.time() - t0
    logger.info(
        "rerun-ocr+build-grid session %s: %d zones, %d cols, %d cells in %.1fs",
        session_id,
        len(result.get("zones", [])),
        result.get("summary", {}).get("total_columns", 0),
        result.get("summary", {}).get("total_cells", 0),
        total_duration,
    )

    return result
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/save-grid")
async def save_grid(session_id: str, request: Request):
    """Save edited grid data from the frontend Excel-like editor.

    Receives the full StructuredGrid with user edits (text changes,
    formatting changes like bold columns, header rows, etc.) and
    persists it to the session's grid_editor_result.

    Fields missing from the body (image size, boxes_detected, summary,
    formatting) are taken from the previously stored grid; duration_seconds
    always comes from the stored grid. The result is marked "edited": True.

    Returns:
        {"session_id": ..., "saved": True}

    Raises:
        HTTPException: 404 if the session does not exist, 400 if the body is
            not valid JSON, not a JSON object, or lacks "zones".
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    try:
        body = await request.json()
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from e

    # Validate basic structure (a list or scalar body has no .get()).
    if not isinstance(body, dict) or "zones" not in body:
        raise HTTPException(status_code=400, detail="Missing 'zones' in request body")

    # Preserve metadata from the original build
    existing = session.get("grid_editor_result") or {}
    result = {
        "session_id": session_id,
        "image_width": body.get("image_width", existing.get("image_width", 0)),
        "image_height": body.get("image_height", existing.get("image_height", 0)),
        "zones": body["zones"],
        "boxes_detected": body.get("boxes_detected", existing.get("boxes_detected", 0)),
        "summary": body.get("summary", existing.get("summary", {})),
        "formatting": body.get("formatting", existing.get("formatting", {})),
        "duration_seconds": existing.get("duration_seconds", 0),
        "edited": True,
    }

    await update_session_db(session_id, grid_editor_result=result, current_step=11)

    logger.info("save-grid session %s: %d zones saved", session_id, len(body["zones"]))

    return {"session_id": session_id, "saved": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions/{session_id}/grid-editor")
async def get_grid(session_id: str):
    """Return the stored grid_editor_result of a session.

    Raises:
        HTTPException: 404 if the session is unknown or has no grid yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_state = session.get("grid_editor_result")
    if grid_state:
        return grid_state

    raise HTTPException(
        status_code=404,
        detail="No grid editor data. Run build-grid first.",
    )
|
||||||
@@ -0,0 +1,110 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor API — gutter repair endpoints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, Request
|
||||||
|
|
||||||
|
from ocr_pipeline_session_store import (
|
||||||
|
get_session_db,
|
||||||
|
update_session_db,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])


@router.post("/sessions/{session_id}/gutter-repair")
async def gutter_repair(session_id: str):
    """Scan the session grid for gutter-edge OCR errors and return suggestions.

    Per the analysis module's contract this detects:
    - Words truncated/blurred at the book binding (spell_fix)
    - Words split across rows with missing hyphen chars (hyphen_join)

    The analysis result is stored under ground_truth["gutter_repair"] and
    returned unchanged.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if no grid exists.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(
            status_code=400,
            detail="No grid data. Run build-grid first.",
        )

    from cv_gutter_repair import analyse_grid_for_gutter_repair

    analysis = analyse_grid_for_gutter_repair(
        grid_data,
        image_width=grid_data.get("image_width", 0),
    )

    # Persist suggestions in ground_truth.gutter_repair (avoids DB migration)
    ground_truth = session.get("ground_truth") or {}
    ground_truth["gutter_repair"] = analysis
    await update_session_db(session_id, ground_truth=ground_truth)

    suggestion_count = analysis.get("stats", {}).get("suggestions_found", 0)
    logger.info(
        "gutter-repair session %s: %d suggestions in %.2fs",
        session_id,
        suggestion_count,
        analysis.get("duration_seconds", 0),
    )

    return analysis
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/gutter-repair/apply")
async def gutter_repair_apply(session_id: str, request: Request):
    """Apply accepted gutter repair suggestions to the grid.

    Body: { "accepted": ["suggestion_id_1", "suggestion_id_2", ...],
            "text_overrides": { "suggestion_id_1": "alternative text" } }

    "text_overrides" is optional; a non-empty override replaces the
    suggestion's suggested_text before the suggestions are applied. The
    updated grid is persisted as grid_editor_result.

    Returns:
        The result of apply_gutter_suggestions, or
        {"applied_count": 0, "changes": []} when nothing was accepted.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if there is no grid,
            no stored gutter-repair analysis, or the body is not a JSON object.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data.")

    gt = session.get("ground_truth") or {}
    gutter_result = gt.get("gutter_repair")
    if not gutter_result:
        raise HTTPException(
            status_code=400,
            detail="No gutter repair data. Run gutter-repair first.",
        )

    try:
        body = await request.json()
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from e
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    accepted_ids = body.get("accepted", [])
    if not accepted_ids:
        return {"applied_count": 0, "changes": []}

    # text_overrides: { suggestion_id: "alternative_text" }
    # Allows the user to pick a different correction from the alternatives list
    text_overrides = body.get("text_overrides") or {}
    if not isinstance(text_overrides, dict):
        text_overrides = {}

    from cv_gutter_repair import apply_gutter_suggestions

    suggestions = gutter_result.get("suggestions", [])

    # Apply user-selected alternatives before passing to apply
    for s in suggestions:
        override = text_overrides.get(s.get("id", ""))
        if override:
            s["suggested_text"] = override

    result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)

    # Save updated grid back to session
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "gutter-repair/apply session %s: %d changes applied",
        session_id,
        result.get("applied_count", 0),
    )

    return result
|
||||||
@@ -0,0 +1,71 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor API — unified grid endpoints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from ocr_pipeline_session_store import (
|
||||||
|
get_session_db,
|
||||||
|
update_session_db,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])


@router.post("/sessions/{session_id}/build-unified-grid")
async def build_unified_grid_endpoint(session_id: str):
    """Build a single-zone unified grid merging content + box zones.

    Reads the multi-zone grid_editor_result of the session, passes its zones,
    image size and layout metrics to the unified-grid builder, and stores the
    outcome as unified_grid_result so the multi-zone data stays untouched.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if no grid exists.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    from unified_grid import build_unified_grid

    builder_kwargs = {
        "zones": grid_data.get("zones", []),
        "image_width": grid_data.get("image_width", 0),
        "image_height": grid_data.get("image_height", 0),
        "layout_metrics": grid_data.get("layout_metrics", {}),
    }
    unified = build_unified_grid(**builder_kwargs)

    # Persist as separate field (don't overwrite original multi-zone grid)
    await update_session_db(session_id, unified_grid_result=unified)

    totals = unified.get("summary", {})
    logger.info(
        "build-unified-grid session %s: %d rows, %d cells",
        session_id,
        totals.get("total_rows", 0),
        unified.get("summary", {}).get("total_cells", 0),
    )

    return unified
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions/{session_id}/unified-grid")
async def get_unified_grid(session_id: str):
    """Return the stored unified_grid_result of a session.

    Raises:
        HTTPException: 404 if the session is unknown or has no unified grid.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    unified = session.get("unified_grid_result")
    if unified:
        return unified

    raise HTTPException(
        status_code=404,
        detail="No unified grid. Run build-unified-grid first.",
    )
|
||||||
@@ -0,0 +1,492 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor — column detection, cross-column splitting, marker merging.
|
||||||
|
|
||||||
|
Split from grid_editor_helpers.py for maintainability.
|
||||||
|
All functions are pure computation — no HTTP, DB, or session side effects.
|
||||||
|
|
||||||
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||||
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Cross-column word splitting
# ---------------------------------------------------------------------------

# Lazily created spell checker shared by all calls; stays None when the
# spellchecker package (or its dictionary) cannot be loaded.
_spell_cache: Optional[Any] = None
# Set on the first load attempt so a failed load is not retried on every call.
_spell_loaded = False


def _is_recognized_word(text: str) -> bool:
    """Check whether *text* is found in the spell-checker dictionary.

    Uses the spellchecker library (same as cv_syllable_detect.py).
    Returns True for real words like "oder", "Kabel", "Zeitung".
    Returns False for OCR merge artifacts like "sichzie", "dasZimmer".

    Only the German ("de") dictionary is loaded here.
    NOTE(review): the previous docstring promised German *or English* words —
    confirm whether an English dictionary should be loaded as well.

    Args:
        text: Candidate word; compared lower-cased.

    Returns:
        False for empty or one-character input and whenever the spell checker
        is unavailable; otherwise whether the lower-cased text is in the
        dictionary.
    """
    global _spell_cache, _spell_loaded
    if not text or len(text) < 2:
        return False

    if not _spell_loaded:
        _spell_loaded = True
        try:
            from spellchecker import SpellChecker
            _spell_cache = SpellChecker(language="de")
        except Exception as e:
            # Best-effort dependency: without it every word counts as
            # unrecognized. Log once so the degraded mode is visible.
            logger.warning("spellchecker unavailable, word recognition disabled: %s", e)

    if _spell_cache is None:
        return False

    return text.lower() in _spell_cache
|
||||||
|
|
||||||
|
|
||||||
|
def _split_cross_column_words(
|
||||||
|
words: List[Dict],
|
||||||
|
columns: List[Dict],
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""Split word boxes that span across column boundaries.
|
||||||
|
|
||||||
|
When OCR merges adjacent words from different columns (e.g. "sichzie"
|
||||||
|
spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
|
||||||
|
split the word box at the column boundary so each piece is assigned
|
||||||
|
to the correct column.
|
||||||
|
|
||||||
|
Only splits when:
|
||||||
|
- The word has significant overlap (>15% of its width) on both sides
|
||||||
|
- AND the word is not a recognized real word (OCR merge artifact), OR
|
||||||
|
the word contains a case transition (lowercase->uppercase) near the
|
||||||
|
boundary indicating two merged words like "dasZimmer".
|
||||||
|
"""
|
||||||
|
if len(columns) < 2:
|
||||||
|
return words
|
||||||
|
|
||||||
|
# Column boundaries = midpoints between adjacent column edges
|
||||||
|
boundaries = []
|
||||||
|
for i in range(len(columns) - 1):
|
||||||
|
boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
|
||||||
|
boundaries.append(boundary)
|
||||||
|
|
||||||
|
new_words: List[Dict] = []
|
||||||
|
split_count = 0
|
||||||
|
for w in words:
|
||||||
|
w_left = w["left"]
|
||||||
|
w_width = w["width"]
|
||||||
|
w_right = w_left + w_width
|
||||||
|
text = (w.get("text") or "").strip()
|
||||||
|
|
||||||
|
if not text or len(text) < 4 or w_width < 10:
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Find the first boundary this word straddles significantly
|
||||||
|
split_boundary = None
|
||||||
|
for b in boundaries:
|
||||||
|
if w_left < b < w_right:
|
||||||
|
left_part = b - w_left
|
||||||
|
right_part = w_right - b
|
||||||
|
# Both sides must have at least 15% of the word width
|
||||||
|
if left_part > w_width * 0.15 and right_part > w_width * 0.15:
|
||||||
|
split_boundary = b
|
||||||
|
break
|
||||||
|
|
||||||
|
if split_boundary is None:
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Compute approximate split position in the text.
|
||||||
|
left_width = split_boundary - w_left
|
||||||
|
split_ratio = left_width / w_width
|
||||||
|
approx_pos = len(text) * split_ratio
|
||||||
|
|
||||||
|
# Strategy 1: look for a case transition (lowercase->uppercase) near
|
||||||
|
# the approximate split point — e.g. "dasZimmer" splits at 'Z'.
|
||||||
|
split_char = None
|
||||||
|
search_lo = max(1, int(approx_pos) - 3)
|
||||||
|
search_hi = min(len(text), int(approx_pos) + 2)
|
||||||
|
for i in range(search_lo, search_hi):
|
||||||
|
if text[i - 1].islower() and text[i].isupper():
|
||||||
|
split_char = i
|
||||||
|
break
|
||||||
|
|
||||||
|
# Strategy 2: if no case transition, only split if the whole word
|
||||||
|
# is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
|
||||||
|
# Real words like "oder", "Kabel", "Zeitung" must not be split.
|
||||||
|
if split_char is None:
|
||||||
|
clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct
|
||||||
|
if _is_recognized_word(clean):
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
# Not a real word — use floor of proportional position
|
||||||
|
split_char = max(1, min(len(text) - 1, int(approx_pos)))
|
||||||
|
|
||||||
|
left_text = text[:split_char].rstrip()
|
||||||
|
right_text = text[split_char:].lstrip()
|
||||||
|
|
||||||
|
if len(left_text) < 2 or len(right_text) < 2:
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
|
||||||
|
right_width = w_width - round(left_width)
|
||||||
|
new_words.append({
|
||||||
|
**w,
|
||||||
|
"text": left_text,
|
||||||
|
"width": round(left_width),
|
||||||
|
})
|
||||||
|
new_words.append({
|
||||||
|
**w,
|
||||||
|
"text": right_text,
|
||||||
|
"left": round(split_boundary),
|
||||||
|
"width": right_width,
|
||||||
|
})
|
||||||
|
split_count += 1
|
||||||
|
logger.info(
|
||||||
|
"split cross-column word %r -> %r + %r at boundary %.0f",
|
||||||
|
text, left_text, right_text, split_boundary,
|
||||||
|
)
|
||||||
|
|
||||||
|
if split_count:
|
||||||
|
logger.info("split %d cross-column word(s)", split_count)
|
||||||
|
return new_words
|
||||||
|
|
||||||
|
|
||||||
|
def _cluster_columns_by_alignment(
    words: List[Dict],
    zone_w: int,
    rows: List[Dict],
) -> List[Dict[str, Any]]:
    """Detect columns by clustering left-edge alignment across rows.

    Hybrid approach:
    1. Group words by row, find "group start" positions within each row
       (words preceded by a large gap or first word in row)
    2. Cluster group-start left-edges by X-proximity across rows
    3. Filter by row coverage (how many rows have a group start here)
    4. Merge nearby clusters
    5. Build column boundaries

    This filters out mid-phrase word positions (e.g. IPA transcriptions,
    second words in multi-word entries) by only considering positions
    where a new word group begins within a row.

    Args:
        words: Word dicts; ``left``, ``top``, ``width`` and ``height`` are
            read from each one.
        zone_w: Zone width; used to scale the clustering tolerance, the
            merge distance, the column margin and the gap-threshold cap.
        rows: Row dicts; ``y_center`` and ``index`` are read from each one.

    Returns:
        Column dicts with ``index``, ``type``, ``x_min`` and ``x_max``,
        ordered left to right.  An empty list when *words* or *rows* is
        empty; a single ``column_text`` column spanning all words when no
        cluster passes the coverage filters.
    """
    if not words or not rows:
        return []

    total_rows = len(rows)
    if total_rows == 0:
        return []

    # --- Group words by row ---
    # Each word goes to the row whose y_center is closest to the word's
    # own vertical centre.
    row_words: Dict[int, List[Dict]] = {}
    for w in words:
        y_center = w["top"] + w["height"] / 2
        best = min(rows, key=lambda r: abs(r["y_center"] - y_center))
        row_words.setdefault(best["index"], []).append(w)

    # --- Compute adaptive gap threshold for group-start detection ---
    # Collect every positive horizontal gap between neighbouring words of
    # the same row.
    all_gaps: List[float] = []
    for ri, rw_list in row_words.items():
        sorted_rw = sorted(rw_list, key=lambda w: w["left"])
        for i in range(len(sorted_rw) - 1):
            right = sorted_rw[i]["left"] + sorted_rw[i]["width"]
            gap = sorted_rw[i + 1]["left"] - right
            if gap > 0:
                all_gaps.append(gap)

    if all_gaps:
        sorted_gaps = sorted(all_gaps)
        median_gap = sorted_gaps[len(sorted_gaps) // 2]
        heights = [w["height"] for w in words if w.get("height", 0) > 0]
        median_h = sorted(heights)[len(heights) // 2] if heights else 25

        # For small word counts (boxes, sub-zones): PaddleOCR returns
        # multi-word blocks, so ALL inter-word gaps are potential column
        # boundaries.  Use a low threshold based on word height — any gap
        # wider than ~1x median word height is a column separator.
        if len(words) <= 60:
            gap_threshold = max(median_h * 1.0, 25)
            logger.info(
                "alignment columns (small zone): gap_threshold=%.0f "
                "(median_h=%.0f, %d words, %d gaps: %s)",
                gap_threshold, median_h, len(words), len(sorted_gaps),
                [int(g) for g in sorted_gaps[:10]],
            )
        else:
            # Standard approach for large zones (full pages)
            gap_threshold = max(median_gap * 3, median_h * 1.5, 30)
        # Cap at 25% of zone width
        # NOTE(review): confirm the cap is meant to apply to the small-zone
        # threshold as well as to the standard one.
        max_gap = zone_w * 0.25
        if gap_threshold > max_gap > 30:
            logger.info("alignment columns: capping gap_threshold %.0f -> %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w)
            gap_threshold = max_gap
    else:
        # No positive gap anywhere: fall back to a fixed threshold.
        gap_threshold = 50

    # --- Find group-start positions (left-edges that begin a new column) ---
    start_positions: List[tuple] = []  # (left_edge, row_index)
    for ri, rw_list in row_words.items():
        sorted_rw = sorted(rw_list, key=lambda w: w["left"])
        # First word in row is always a group start
        start_positions.append((sorted_rw[0]["left"], ri))
        for i in range(1, len(sorted_rw)):
            right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"]
            gap = sorted_rw[i]["left"] - right_prev
            if gap >= gap_threshold:
                start_positions.append((sorted_rw[i]["left"], ri))

    start_positions.sort(key=lambda x: x[0])

    logger.info(
        "alignment columns: %d group-start positions from %d words "
        "(gap_threshold=%.0f, %d rows)",
        len(start_positions), len(words), gap_threshold, total_rows,
    )

    if not start_positions:
        x_min = min(w["left"] for w in words)
        x_max = max(w["left"] + w["width"] for w in words)
        return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]

    # --- Cluster group-start positions by X-proximity ---
    # Positions are sorted by x; a position joins the current cluster when
    # it lies within `tolerance` of the cluster's most recent edge.
    tolerance = max(10, int(zone_w * 0.01))
    clusters: List[Dict[str, Any]] = []
    cur_edges = [start_positions[0][0]]
    cur_rows = {start_positions[0][1]}

    for left, row_idx in start_positions[1:]:
        if left - cur_edges[-1] <= tolerance:
            cur_edges.append(left)
            cur_rows.add(row_idx)
        else:
            clusters.append({
                "mean_x": int(sum(cur_edges) / len(cur_edges)),
                "min_edge": min(cur_edges),
                "max_edge": max(cur_edges),
                "count": len(cur_edges),
                "distinct_rows": len(cur_rows),
                "row_coverage": len(cur_rows) / total_rows,
            })
            cur_edges = [left]
            cur_rows = {row_idx}
    # Flush the cluster that was still open when the loop ended.
    clusters.append({
        "mean_x": int(sum(cur_edges) / len(cur_edges)),
        "min_edge": min(cur_edges),
        "max_edge": max(cur_edges),
        "count": len(cur_edges),
        "distinct_rows": len(cur_rows),
        "row_coverage": len(cur_rows) / total_rows,
    })

    # --- Filter by row coverage ---
    # These thresholds must be high enough to avoid false columns in flowing
    # text (random inter-word gaps) while still detecting real columns in
    # vocabulary worksheets (which typically have >80% row coverage).
    MIN_COVERAGE_PRIMARY = 0.35
    MIN_COVERAGE_SECONDARY = 0.12
    MIN_WORDS_SECONDARY = 4
    MIN_DISTINCT_ROWS = 3

    # Content boundary for left-margin detection
    content_x_min = min(w["left"] for w in words)
    content_x_max = max(w["left"] + w["width"] for w in words)
    content_span = content_x_max - content_x_min

    primary = [
        c for c in clusters
        if c["row_coverage"] >= MIN_COVERAGE_PRIMARY
        and c["distinct_rows"] >= MIN_DISTINCT_ROWS
    ]
    # Clusters are tracked by object identity so none is counted twice.
    primary_ids = {id(c) for c in primary}
    secondary = [
        c for c in clusters
        if id(c) not in primary_ids
        and c["row_coverage"] >= MIN_COVERAGE_SECONDARY
        and c["count"] >= MIN_WORDS_SECONDARY
        and c["distinct_rows"] >= MIN_DISTINCT_ROWS
    ]

    # Tertiary: narrow left-margin columns (page refs, markers) that have
    # too few rows for secondary but are clearly left-aligned and separated
    # from the main content.  These appear at the far left or far right and
    # have a large gap to the nearest significant cluster.
    used_ids = {id(c) for c in primary} | {id(c) for c in secondary}
    sig_xs = [c["mean_x"] for c in primary + secondary]

    # Tertiary: clusters that are clearly to the LEFT of the first
    # significant column (or RIGHT of the last).  If words consistently
    # start at a position left of the established first column boundary,
    # they MUST be a separate column — regardless of how few rows they
    # cover.  The only requirement is a clear spatial gap.
    MIN_COVERAGE_TERTIARY = 0.02  # at least 1 row effectively
    tertiary = []
    for c in clusters:
        if id(c) in used_ids:
            continue
        if c["distinct_rows"] < 1:
            continue
        if c["row_coverage"] < MIN_COVERAGE_TERTIARY:
            continue
        # Must be near left or right content margin (within 15%)
        rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5
        if not (rel_pos < 0.15 or rel_pos > 0.85):
            continue
        # Must have significant gap to nearest significant cluster
        if sig_xs:
            min_dist = min(abs(c["mean_x"] - sx) for sx in sig_xs)
            if min_dist < max(30, content_span * 0.02):
                continue
        tertiary.append(c)

    if tertiary:
        for c in tertiary:
            logger.info(
                " tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
                c["mean_x"], c["min_edge"], c["max_edge"],
                c["count"], c["distinct_rows"], c["row_coverage"] * 100,
            )

    significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"])

    for c in significant:
        logger.info(
            " significant cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
            c["mean_x"], c["min_edge"], c["max_edge"],
            c["count"], c["distinct_rows"], c["row_coverage"] * 100,
        )
    # The "significant" total below also includes tertiary clusters, which
    # the message does not list separately.
    logger.info(
        "alignment columns: %d clusters, %d primary, %d secondary -> %d significant",
        len(clusters), len(primary), len(secondary), len(significant),
    )

    if not significant:
        # Fallback: single column covering all content
        x_min = min(w["left"] for w in words)
        x_max = max(w["left"] + w["width"] for w in words)
        return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]

    # --- Merge nearby clusters ---
    # Work on copies so the dicts in `clusters` stay untouched; merged
    # entries get a count-weighted mean_x and the union of the edge range.
    merge_distance = max(25, int(zone_w * 0.03))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s["mean_x"] - merged[-1]["mean_x"] < merge_distance:
            prev = merged[-1]
            total = prev["count"] + s["count"]
            prev["mean_x"] = (
                prev["mean_x"] * prev["count"] + s["mean_x"] * s["count"]
            ) // total
            prev["count"] = total
            prev["min_edge"] = min(prev["min_edge"], s["min_edge"])
            prev["max_edge"] = max(prev["max_edge"], s["max_edge"])
            prev["distinct_rows"] = max(prev["distinct_rows"], s["distinct_rows"])
        else:
            merged.append(s.copy())

    logger.info(
        "alignment columns: %d after merge (distance=%d)",
        len(merged), merge_distance,
    )

    # --- Build column boundaries ---
    # Each column starts slightly left of its cluster's leftmost edge and
    # ends where the next column starts; the last one ends at the content
    # right edge.
    margin = max(5, int(zone_w * 0.005))
    content_x_min = min(w["left"] for w in words)
    content_x_max = max(w["left"] + w["width"] for w in words)

    columns: List[Dict[str, Any]] = []
    for i, cluster in enumerate(merged):
        x_min = max(content_x_min, cluster["min_edge"] - margin)
        if i + 1 < len(merged):
            x_max = merged[i + 1]["min_edge"] - margin
        else:
            x_max = content_x_max

        columns.append({
            "index": i,
            "type": f"column_{i + 1}" if len(merged) > 1 else "column_text",
            "x_min": x_min,
            "x_max": x_max,
        })

    return columns
|
||||||
|
|
||||||
|
|
||||||
|
_MARKER_CHARS = set("*-+#>")
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_inline_marker_columns(
|
||||||
|
columns: List[Dict],
|
||||||
|
words: List[Dict],
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""Merge narrow marker columns (bullets, numbering) into adjacent text.
|
||||||
|
|
||||||
|
Bullet points (*, -) and numbering (1., 2.) create narrow columns
|
||||||
|
at the left edge of a zone. These are inline markers that indent text,
|
||||||
|
not real separate columns. Merge them with their right neighbour.
|
||||||
|
|
||||||
|
Does NOT merge columns containing alphabetic words like "to", "in",
|
||||||
|
"der", "die", "das" — those are legitimate content columns.
|
||||||
|
"""
|
||||||
|
if len(columns) < 2:
|
||||||
|
return columns
|
||||||
|
|
||||||
|
merged: List[Dict] = []
|
||||||
|
skip: set = set()
|
||||||
|
|
||||||
|
for i, col in enumerate(columns):
|
||||||
|
if i in skip:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Find words in this column
|
||||||
|
col_words = [
|
||||||
|
w for w in words
|
||||||
|
if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
|
||||||
|
]
|
||||||
|
col_width = col["x_max"] - col["x_min"]
|
||||||
|
|
||||||
|
# Narrow column with mostly short words -> MIGHT be inline markers
|
||||||
|
if col_words and col_width < 80:
|
||||||
|
avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words)
|
||||||
|
if avg_len <= 2 and i + 1 < len(columns):
|
||||||
|
# Check if words are actual markers (symbols/numbers) vs
|
||||||
|
# real alphabetic words like "to", "in", "der", "die"
|
||||||
|
texts = [(w.get("text") or "").strip() for w in col_words]
|
||||||
|
alpha_count = sum(
|
||||||
|
1 for t in texts
|
||||||
|
if t and t[0].isalpha() and t not in _MARKER_CHARS
|
||||||
|
)
|
||||||
|
alpha_ratio = alpha_count / len(texts) if texts else 0
|
||||||
|
|
||||||
|
# If >=50% of words are alphabetic, this is a real column
|
||||||
|
if alpha_ratio >= 0.5:
|
||||||
|
logger.info(
|
||||||
|
" kept narrow column %d (w=%d, avg_len=%.1f, "
|
||||||
|
"alpha=%.0f%%) -- contains real words",
|
||||||
|
i, col_width, avg_len, alpha_ratio * 100,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Merge into next column
|
||||||
|
next_col = columns[i + 1].copy()
|
||||||
|
next_col["x_min"] = col["x_min"]
|
||||||
|
merged.append(next_col)
|
||||||
|
skip.add(i + 1)
|
||||||
|
logger.info(
|
||||||
|
" merged inline marker column %d (w=%d, avg_len=%.1f) "
|
||||||
|
"into column %d",
|
||||||
|
i, col_width, avg_len, i + 1,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
merged.append(col)
|
||||||
|
|
||||||
|
# Re-index
|
||||||
|
for i, col in enumerate(merged):
|
||||||
|
col["index"] = i
|
||||||
|
col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text"
|
||||||
|
|
||||||
|
return merged
|
||||||
@@ -0,0 +1,402 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor — word/zone filtering, border ghosts, decorative margins, footers.
|
||||||
|
|
||||||
|
Split from grid_editor_helpers.py for maintainability.
|
||||||
|
All functions are pure computation — no HTTP, DB, or session side effects.
|
||||||
|
|
||||||
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||||
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
|
||||||
|
"""Remove page-border decoration strip words BEFORE column detection.
|
||||||
|
|
||||||
|
Scans from each page edge inward to find the first significant x-gap
|
||||||
|
(>30 px). If the edge cluster contains <15 % of total words, those
|
||||||
|
words are removed as border-strip artifacts (alphabet letters,
|
||||||
|
illustration fragments).
|
||||||
|
|
||||||
|
Must run BEFORE ``_build_zone_grid`` so that column detection only
|
||||||
|
sees real content words and doesn't produce inflated row counts.
|
||||||
|
"""
|
||||||
|
if len(words) < 10:
|
||||||
|
return words, 0
|
||||||
|
|
||||||
|
sorted_words = sorted(words, key=lambda w: w.get("left", 0))
|
||||||
|
total = len(sorted_words)
|
||||||
|
|
||||||
|
# -- Left-edge scan (running max right-edge) --
|
||||||
|
left_count = 0
|
||||||
|
running_right = 0
|
||||||
|
for gi in range(total - 1):
|
||||||
|
running_right = max(
|
||||||
|
running_right,
|
||||||
|
sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
|
||||||
|
)
|
||||||
|
if sorted_words[gi + 1].get("left", 0) - running_right > 30:
|
||||||
|
left_count = gi + 1
|
||||||
|
break
|
||||||
|
|
||||||
|
# -- Right-edge scan (running min left) --
|
||||||
|
right_count = 0
|
||||||
|
running_left = sorted_words[-1].get("left", 0)
|
||||||
|
for gi in range(total - 1, 0, -1):
|
||||||
|
running_left = min(running_left, sorted_words[gi].get("left", 0))
|
||||||
|
prev_right = (
|
||||||
|
sorted_words[gi - 1].get("left", 0)
|
||||||
|
+ sorted_words[gi - 1].get("width", 0)
|
||||||
|
)
|
||||||
|
if running_left - prev_right > 30:
|
||||||
|
right_count = total - gi
|
||||||
|
break
|
||||||
|
|
||||||
|
# Validate candidate strip: real border decorations are mostly short
|
||||||
|
# words (alphabet letters like "A", "Bb", stray marks). Multi-word
|
||||||
|
# content like "der Ranzen" or "die Schals" (continuation of German
|
||||||
|
# translations) must NOT be removed.
|
||||||
|
def _is_decorative_strip(candidates: List[Dict]) -> bool:
|
||||||
|
if not candidates:
|
||||||
|
return False
|
||||||
|
short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
|
||||||
|
return short / len(candidates) >= 0.45
|
||||||
|
|
||||||
|
strip_ids: set = set()
|
||||||
|
if left_count > 0 and left_count / total < 0.20:
|
||||||
|
candidates = sorted_words[:left_count]
|
||||||
|
if _is_decorative_strip(candidates):
|
||||||
|
strip_ids = {id(w) for w in candidates}
|
||||||
|
elif right_count > 0 and right_count / total < 0.20:
|
||||||
|
candidates = sorted_words[total - right_count:]
|
||||||
|
if _is_decorative_strip(candidates):
|
||||||
|
strip_ids = {id(w) for w in candidates}
|
||||||
|
|
||||||
|
if not strip_ids:
|
||||||
|
return words, 0
|
||||||
|
|
||||||
|
return [w for w in words if id(w) not in strip_ids], len(strip_ids)
|
||||||
|
|
||||||
|
|
||||||
|
# Characters that are typically OCR artefacts from box border lines.
|
||||||
|
# Intentionally excludes ! (red markers) and . , ; (real punctuation).
|
||||||
|
_GRID_GHOST_CHARS = set("|1lI[](){}/\\-\u2014\u2013_~=+")
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_border_ghosts(
|
||||||
|
words: List[Dict],
|
||||||
|
boxes: List,
|
||||||
|
) -> tuple:
|
||||||
|
"""Remove words sitting on box borders that are OCR artefacts.
|
||||||
|
|
||||||
|
Returns (filtered_words, removed_count).
|
||||||
|
"""
|
||||||
|
if not boxes or not words:
|
||||||
|
return words, 0
|
||||||
|
|
||||||
|
# Build border bands from detected boxes
|
||||||
|
x_bands: List[tuple] = []
|
||||||
|
y_bands: List[tuple] = []
|
||||||
|
for b in boxes:
|
||||||
|
bt = (
|
||||||
|
b.border_thickness
|
||||||
|
if hasattr(b, "border_thickness")
|
||||||
|
else b.get("border_thickness", 3)
|
||||||
|
)
|
||||||
|
# Skip borderless boxes (images/graphics) -- no border line to produce ghosts
|
||||||
|
if bt == 0:
|
||||||
|
continue
|
||||||
|
bx = b.x if hasattr(b, "x") else b.get("x", 0)
|
||||||
|
by = b.y if hasattr(b, "y") else b.get("y", 0)
|
||||||
|
bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
|
||||||
|
bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
|
||||||
|
margin = max(bt * 2, 10) + 6
|
||||||
|
x_bands.append((bx - margin, bx + margin))
|
||||||
|
x_bands.append((bx + bw - margin, bx + bw + margin))
|
||||||
|
y_bands.append((by - margin, by + margin))
|
||||||
|
y_bands.append((by + bh - margin, by + bh + margin))
|
||||||
|
|
||||||
|
def _is_ghost(w: Dict) -> bool:
|
||||||
|
text = (w.get("text") or "").strip()
|
||||||
|
if not text:
|
||||||
|
return False
|
||||||
|
# Check if any word edge (not just center) touches a border band
|
||||||
|
w_left = w["left"]
|
||||||
|
w_right = w["left"] + w["width"]
|
||||||
|
w_top = w["top"]
|
||||||
|
w_bottom = w["top"] + w["height"]
|
||||||
|
on_border = (
|
||||||
|
any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands)
|
||||||
|
or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands)
|
||||||
|
)
|
||||||
|
if not on_border:
|
||||||
|
return False
|
||||||
|
if len(text) == 1 and text in _GRID_GHOST_CHARS:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
filtered = [w for w in words if not _is_ghost(w)]
|
||||||
|
return filtered, len(words) - len(filtered)
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
|
||||||
|
"""Extract all word_boxes from cells into a flat list of word dicts."""
|
||||||
|
words: List[Dict] = []
|
||||||
|
for cell in cells:
|
||||||
|
for wb in cell.get("word_boxes") or []:
|
||||||
|
if wb.get("text", "").strip():
|
||||||
|
words.append({
|
||||||
|
"text": wb["text"],
|
||||||
|
"left": wb["left"],
|
||||||
|
"top": wb["top"],
|
||||||
|
"width": wb["width"],
|
||||||
|
"height": wb["height"],
|
||||||
|
"conf": wb.get("conf", 0),
|
||||||
|
})
|
||||||
|
return words
|
||||||
|
|
||||||
|
|
||||||
|
def _words_in_zone(
|
||||||
|
words: List[Dict],
|
||||||
|
zone_y: int,
|
||||||
|
zone_h: int,
|
||||||
|
zone_x: int,
|
||||||
|
zone_w: int,
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""Filter words whose Y-center falls within a zone's bounds."""
|
||||||
|
zone_y_end = zone_y + zone_h
|
||||||
|
zone_x_end = zone_x + zone_w
|
||||||
|
result = []
|
||||||
|
for w in words:
|
||||||
|
cy = w["top"] + w["height"] / 2
|
||||||
|
cx = w["left"] + w["width"] / 2
|
||||||
|
if zone_y <= cy <= zone_y_end and zone_x <= cx <= zone_x_end:
|
||||||
|
result.append(w)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _get_content_bounds(words: List[Dict]) -> tuple:
|
||||||
|
"""Get content bounds from word positions."""
|
||||||
|
if not words:
|
||||||
|
return 0, 0, 0, 0
|
||||||
|
x_min = min(w["left"] for w in words)
|
||||||
|
y_min = min(w["top"] for w in words)
|
||||||
|
x_max = max(w["left"] + w["width"] for w in words)
|
||||||
|
y_max = max(w["top"] + w["height"] for w in words)
|
||||||
|
return x_min, y_min, x_max - x_min, y_max - y_min
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_decorative_margin(
|
||||||
|
words: List[Dict],
|
||||||
|
img_w: int,
|
||||||
|
log: Any,
|
||||||
|
session_id: str,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Remove words that belong to a decorative alphabet strip on a margin.
|
||||||
|
|
||||||
|
Some vocabulary worksheets have a vertical A-Z alphabet graphic along
|
||||||
|
the left or right edge. OCR reads each letter as an isolated single-
|
||||||
|
character word. These decorative elements are not content and confuse
|
||||||
|
column/row detection.
|
||||||
|
|
||||||
|
Detection criteria (phase 1 -- find the strip using single-char words):
|
||||||
|
- Words are in the outer 30% of the page (left or right)
|
||||||
|
- Nearly all words are single characters (letters or digits)
|
||||||
|
- At least 8 such words form a vertical strip (>=8 unique Y positions)
|
||||||
|
- Average horizontal spread of the strip is small (< 80px)
|
||||||
|
|
||||||
|
Phase 2 -- once a strip is confirmed, also remove any short word (<=3
|
||||||
|
chars) in the same narrow x-range. This catches multi-char OCR
|
||||||
|
artifacts like "Vv" that belong to the same decorative element.
|
||||||
|
|
||||||
|
Modifies *words* in place.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with 'found' (bool), 'side' (str), 'letters_detected' (int).
|
||||||
|
"""
|
||||||
|
no_strip: Dict[str, Any] = {"found": False, "side": "", "letters_detected": 0}
|
||||||
|
if not words or img_w <= 0:
|
||||||
|
return no_strip
|
||||||
|
|
||||||
|
margin_cutoff = img_w * 0.30
|
||||||
|
# Phase 1: find candidate strips using short words (1-2 chars).
|
||||||
|
# OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb")
|
||||||
|
# rather than singles, so accept <=2-char words as strip candidates.
|
||||||
|
left_strip = [
|
||||||
|
w for w in words
|
||||||
|
if len((w.get("text") or "").strip()) <= 2
|
||||||
|
and w["left"] + w.get("width", 0) / 2 < margin_cutoff
|
||||||
|
]
|
||||||
|
right_strip = [
|
||||||
|
w for w in words
|
||||||
|
if len((w.get("text") or "").strip()) <= 2
|
||||||
|
and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
|
||||||
|
]
|
||||||
|
|
||||||
|
for strip, side in [(left_strip, "left"), (right_strip, "right")]:
|
||||||
|
if len(strip) < 6:
|
||||||
|
continue
|
||||||
|
# Check vertical distribution: should have many distinct Y positions
|
||||||
|
y_centers = sorted(set(
|
||||||
|
int(w["top"] + w.get("height", 0) / 2) // 20 * 20 # bucket
|
||||||
|
for w in strip
|
||||||
|
))
|
||||||
|
if len(y_centers) < 6:
|
||||||
|
continue
|
||||||
|
# Check horizontal compactness
|
||||||
|
x_positions = [w["left"] for w in strip]
|
||||||
|
x_min = min(x_positions)
|
||||||
|
x_max = max(x_positions)
|
||||||
|
x_spread = x_max - x_min
|
||||||
|
if x_spread > 80:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Phase 2: strip confirmed -- also collect short words in same x-range
|
||||||
|
# Expand x-range slightly to catch neighbors (e.g. "Vv" next to "U")
|
||||||
|
strip_x_lo = x_min - 20
|
||||||
|
strip_x_hi = x_max + 60 # word width + tolerance
|
||||||
|
all_strip_words = [
|
||||||
|
w for w in words
|
||||||
|
if len((w.get("text") or "").strip()) <= 3
|
||||||
|
and strip_x_lo <= w["left"] <= strip_x_hi
|
||||||
|
and (w["left"] + w.get("width", 0) / 2 < margin_cutoff
|
||||||
|
if side == "left"
|
||||||
|
else w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff)
|
||||||
|
]
|
||||||
|
|
||||||
|
strip_set = set(id(w) for w in all_strip_words)
|
||||||
|
before = len(words)
|
||||||
|
words[:] = [w for w in words if id(w) not in strip_set]
|
||||||
|
removed = before - len(words)
|
||||||
|
if removed:
|
||||||
|
log.info(
|
||||||
|
"build-grid session %s: removed %d decorative %s-margin words "
|
||||||
|
"(strip x=%d-%d)",
|
||||||
|
session_id, removed, side, strip_x_lo, strip_x_hi,
|
||||||
|
)
|
||||||
|
return {"found": True, "side": side, "letters_detected": len(strip)}
|
||||||
|
|
||||||
|
return no_strip
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_footer_words(
|
||||||
|
words: List[Dict],
|
||||||
|
img_h: int,
|
||||||
|
log: Any,
|
||||||
|
session_id: str,
|
||||||
|
) -> Optional[Dict]:
|
||||||
|
"""Remove isolated words in the bottom 5% of the page (page numbers).
|
||||||
|
|
||||||
|
Modifies *words* in place and returns a page_number metadata dict
|
||||||
|
if a page number was extracted, or None.
|
||||||
|
"""
|
||||||
|
if not words or img_h <= 0:
|
||||||
|
return None
|
||||||
|
footer_y = img_h * 0.95
|
||||||
|
footer_words = [
|
||||||
|
w for w in words
|
||||||
|
if w["top"] + w.get("height", 0) / 2 > footer_y
|
||||||
|
]
|
||||||
|
if not footer_words:
|
||||||
|
return None
|
||||||
|
# Only remove if footer has very few words (<= 3) with short text
|
||||||
|
total_text = "".join((w.get("text") or "").strip() for w in footer_words)
|
||||||
|
if len(footer_words) <= 3 and len(total_text) <= 10:
|
||||||
|
# Extract page number metadata before removing
|
||||||
|
page_number_info = {
|
||||||
|
"text": total_text.strip(),
|
||||||
|
"y_pct": round(footer_words[0]["top"] / img_h * 100, 1),
|
||||||
|
}
|
||||||
|
# Try to parse as integer
|
||||||
|
digits = "".join(c for c in total_text if c.isdigit())
|
||||||
|
if digits:
|
||||||
|
page_number_info["number"] = int(digits)
|
||||||
|
|
||||||
|
footer_set = set(id(w) for w in footer_words)
|
||||||
|
words[:] = [w for w in words if id(w) not in footer_set]
|
||||||
|
log.info(
|
||||||
|
"build-grid session %s: extracted page number '%s' and removed %d footer words",
|
||||||
|
session_id, total_text, len(footer_words),
|
||||||
|
)
|
||||||
|
return page_number_info
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_header_junk(
|
||||||
|
words: List[Dict],
|
||||||
|
img_h: int,
|
||||||
|
log: Any,
|
||||||
|
session_id: str,
|
||||||
|
) -> None:
|
||||||
|
"""Remove OCR junk from header illustrations above the real content.
|
||||||
|
|
||||||
|
Textbook pages often have decorative header graphics (illustrations,
|
||||||
|
icons) that OCR reads as low-confidence junk characters. Real content
|
||||||
|
typically starts further down the page.
|
||||||
|
|
||||||
|
Algorithm:
|
||||||
|
1. Find the "content start" -- the first Y position where a dense
|
||||||
|
horizontal row of 3+ high-confidence words begins.
|
||||||
|
2. Above that line, remove words with conf < 75 and text <= 3 chars.
|
||||||
|
These are almost certainly OCR artifacts from illustrations.
|
||||||
|
|
||||||
|
Modifies *words* in place.
|
||||||
|
"""
|
||||||
|
if not words or img_h <= 0:
|
||||||
|
return
|
||||||
|
|
||||||
|
# --- Find content start: first horizontal row with >=3 high-conf words ---
|
||||||
|
# Sort words by Y
|
||||||
|
sorted_by_y = sorted(words, key=lambda w: w["top"])
|
||||||
|
content_start_y = 0
|
||||||
|
_ROW_TOLERANCE = img_h * 0.02 # words within 2% of page height = same row
|
||||||
|
_MIN_ROW_WORDS = 3
|
||||||
|
_MIN_CONF = 80
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
while i < len(sorted_by_y):
|
||||||
|
row_y = sorted_by_y[i]["top"]
|
||||||
|
# Collect words in this row band
|
||||||
|
row_words = []
|
||||||
|
j = i
|
||||||
|
while j < len(sorted_by_y) and sorted_by_y[j]["top"] - row_y < _ROW_TOLERANCE:
|
||||||
|
row_words.append(sorted_by_y[j])
|
||||||
|
j += 1
|
||||||
|
# Count high-confidence words with real text (> 1 char)
|
||||||
|
high_conf = [
|
||||||
|
w for w in row_words
|
||||||
|
if w.get("conf", 0) >= _MIN_CONF
|
||||||
|
and len((w.get("text") or "").strip()) > 1
|
||||||
|
]
|
||||||
|
if len(high_conf) >= _MIN_ROW_WORDS:
|
||||||
|
content_start_y = row_y
|
||||||
|
break
|
||||||
|
i = j if j > i else i + 1
|
||||||
|
|
||||||
|
if content_start_y <= 0:
|
||||||
|
return # no clear content start found
|
||||||
|
|
||||||
|
# --- Remove low-conf short junk above content start ---
|
||||||
|
junk = [
|
||||||
|
w for w in words
|
||||||
|
if w["top"] + w.get("height", 0) < content_start_y
|
||||||
|
and w.get("conf", 0) < 75
|
||||||
|
and len((w.get("text") or "").strip()) <= 3
|
||||||
|
]
|
||||||
|
if not junk:
|
||||||
|
return
|
||||||
|
|
||||||
|
junk_set = set(id(w) for w in junk)
|
||||||
|
before = len(words)
|
||||||
|
words[:] = [w for w in words if id(w) not in junk_set]
|
||||||
|
removed = before - len(words)
|
||||||
|
if removed:
|
||||||
|
log.info(
|
||||||
|
"build-grid session %s: removed %d header junk words above y=%d "
|
||||||
|
"(content start)",
|
||||||
|
session_id, removed, content_start_y,
|
||||||
|
)
|
||||||
@@ -0,0 +1,499 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor — header/heading detection and colspan (merged cell) detection.
|
||||||
|
Split from grid_editor_helpers.py. Pure computation, no HTTP/DB side effects.
|
||||||
|
Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from cv_ocr_engines import _text_has_garbled_ipa
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int:
|
||||||
|
"""Detect heading rows by color + height after color annotation.
|
||||||
|
|
||||||
|
A row is a heading if:
|
||||||
|
1. ALL word_boxes have color_name != 'black' (typically 'blue')
|
||||||
|
2. Mean word height > 1.2x median height of all words in the zone
|
||||||
|
|
||||||
|
Detected heading rows are merged into a single spanning cell.
|
||||||
|
Returns count of headings detected.
|
||||||
|
"""
|
||||||
|
heading_count = 0
|
||||||
|
|
||||||
|
for z in zones_data:
|
||||||
|
cells = z.get("cells", [])
|
||||||
|
rows = z.get("rows", [])
|
||||||
|
columns = z.get("columns", [])
|
||||||
|
if not cells or not rows or len(columns) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Compute median word height across the zone
|
||||||
|
all_heights = []
|
||||||
|
for cell in cells:
|
||||||
|
for wb in cell.get("word_boxes") or []:
|
||||||
|
h = wb.get("height", 0)
|
||||||
|
if h > 0:
|
||||||
|
all_heights.append(h)
|
||||||
|
if not all_heights:
|
||||||
|
continue
|
||||||
|
all_heights_sorted = sorted(all_heights)
|
||||||
|
median_h = all_heights_sorted[len(all_heights_sorted) // 2]
|
||||||
|
|
||||||
|
heading_row_indices = []
|
||||||
|
for row in rows:
|
||||||
|
if row.get("is_header"):
|
||||||
|
continue # already detected as header
|
||||||
|
ri = row["index"]
|
||||||
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||||
|
row_wbs = [
|
||||||
|
wb for cell in row_cells
|
||||||
|
for wb in cell.get("word_boxes") or []
|
||||||
|
]
|
||||||
|
if not row_wbs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Condition 1: ALL words are non-black
|
||||||
|
all_colored = all(
|
||||||
|
wb.get("color_name", "black") != "black"
|
||||||
|
for wb in row_wbs
|
||||||
|
)
|
||||||
|
if not all_colored:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Condition 2: mean height > 1.2x median
|
||||||
|
mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs)
|
||||||
|
if mean_h <= median_h * 1.2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
heading_row_indices.append(ri)
|
||||||
|
|
||||||
|
# Merge heading cells into spanning cells
|
||||||
|
for hri in heading_row_indices:
|
||||||
|
header_cells = [c for c in cells if c.get("row_index") == hri]
|
||||||
|
if len(header_cells) <= 1:
|
||||||
|
# Single cell -- just mark it as heading
|
||||||
|
if header_cells:
|
||||||
|
header_cells[0]["col_type"] = "heading"
|
||||||
|
heading_count += 1
|
||||||
|
# Mark row as header
|
||||||
|
for row in rows:
|
||||||
|
if row["index"] == hri:
|
||||||
|
row["is_header"] = True
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Collect all word_boxes and text from all columns
|
||||||
|
all_wb = []
|
||||||
|
all_text_parts = []
|
||||||
|
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
||||||
|
all_wb.extend(hc.get("word_boxes", []))
|
||||||
|
if hc.get("text", "").strip():
|
||||||
|
all_text_parts.append(hc["text"].strip())
|
||||||
|
|
||||||
|
# Remove all cells for this row, replace with one spanning cell
|
||||||
|
z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
|
||||||
|
|
||||||
|
if all_wb:
|
||||||
|
x_min = min(wb["left"] for wb in all_wb)
|
||||||
|
y_min = min(wb["top"] for wb in all_wb)
|
||||||
|
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
||||||
|
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
||||||
|
|
||||||
|
# Use the actual starting col_index from the first cell
|
||||||
|
first_col = min(hc["col_index"] for hc in header_cells)
|
||||||
|
zone_idx = z.get("zone_index", 0)
|
||||||
|
z["cells"].append({
|
||||||
|
"cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
|
||||||
|
"zone_index": zone_idx,
|
||||||
|
"row_index": hri,
|
||||||
|
"col_index": first_col,
|
||||||
|
"col_type": "heading",
|
||||||
|
"text": " ".join(all_text_parts),
|
||||||
|
"confidence": 0.0,
|
||||||
|
"bbox_px": {"x": x_min, "y": y_min,
|
||||||
|
"w": x_max - x_min, "h": y_max - y_min},
|
||||||
|
"bbox_pct": {
|
||||||
|
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
||||||
|
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
||||||
|
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
||||||
|
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
||||||
|
},
|
||||||
|
"word_boxes": all_wb,
|
||||||
|
"ocr_engine": "words_first",
|
||||||
|
"is_bold": True,
|
||||||
|
})
|
||||||
|
|
||||||
|
# Mark row as header
|
||||||
|
for row in rows:
|
||||||
|
if row["index"] == hri:
|
||||||
|
row["is_header"] = True
|
||||||
|
heading_count += 1
|
||||||
|
|
||||||
|
return heading_count
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_heading_rows_by_single_cell(
|
||||||
|
zones_data: List[Dict], img_w: int, img_h: int,
|
||||||
|
) -> int:
|
||||||
|
"""Detect heading rows that have only a single content cell.
|
||||||
|
|
||||||
|
Black headings like "Theme" have normal color and height, so they are
|
||||||
|
missed by ``_detect_heading_rows_by_color``. The distinguishing signal
|
||||||
|
is that they occupy only one column while normal vocabulary rows fill
|
||||||
|
at least 2-3 columns.
|
||||||
|
|
||||||
|
A row qualifies as a heading if:
|
||||||
|
1. It is not already marked as a header/heading.
|
||||||
|
2. It has exactly ONE cell whose col_type starts with ``column_``
|
||||||
|
(excluding column_1 / page_ref which only carries page numbers).
|
||||||
|
3. That single cell is NOT in the last column (continuation/example
|
||||||
|
lines like "2. Ver\u00e4nderung, Wechsel" often sit alone in column_4).
|
||||||
|
4. The text does not start with ``[`` (IPA continuation).
|
||||||
|
5. The zone has >=3 columns and >=5 rows (avoids false positives in
|
||||||
|
tiny zones).
|
||||||
|
6. The majority of rows in the zone have >=2 content cells (ensures
|
||||||
|
we are in a multi-column vocab layout).
|
||||||
|
"""
|
||||||
|
heading_count = 0
|
||||||
|
|
||||||
|
for z in zones_data:
|
||||||
|
cells = z.get("cells", [])
|
||||||
|
rows = z.get("rows", [])
|
||||||
|
columns = z.get("columns", [])
|
||||||
|
if len(columns) < 3 or len(rows) < 5:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Determine the last col_index (example/sentence column)
|
||||||
|
col_indices = sorted(set(c.get("col_index", 0) for c in cells))
|
||||||
|
if not col_indices:
|
||||||
|
continue
|
||||||
|
last_col = col_indices[-1]
|
||||||
|
|
||||||
|
# Count content cells per row (column_* but not column_1/page_ref).
|
||||||
|
# Exception: column_1 cells that contain a dictionary article word
|
||||||
|
# (die/der/das etc.) ARE content -- they appear in dictionary layouts
|
||||||
|
# where the leftmost column holds grammatical articles.
|
||||||
|
_ARTICLE_WORDS = {
|
||||||
|
"die", "der", "das", "dem", "den", "des", "ein", "eine",
|
||||||
|
"the", "a", "an",
|
||||||
|
}
|
||||||
|
row_content_counts: Dict[int, int] = {}
|
||||||
|
for cell in cells:
|
||||||
|
ct = cell.get("col_type", "")
|
||||||
|
if not ct.startswith("column_"):
|
||||||
|
continue
|
||||||
|
if ct == "column_1":
|
||||||
|
ctext = (cell.get("text") or "").strip().lower()
|
||||||
|
if ctext not in _ARTICLE_WORDS:
|
||||||
|
continue
|
||||||
|
ri = cell.get("row_index", -1)
|
||||||
|
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
|
||||||
|
|
||||||
|
# Majority of rows must have >=2 content cells
|
||||||
|
multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
|
||||||
|
if multi_col_rows < len(rows) * 0.4:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Exclude first and last non-header rows -- these are typically
|
||||||
|
# page numbers or footer text, not headings.
|
||||||
|
non_header_rows = [r for r in rows if not r.get("is_header")]
|
||||||
|
if len(non_header_rows) < 3:
|
||||||
|
continue
|
||||||
|
first_ri = non_header_rows[0]["index"]
|
||||||
|
last_ri = non_header_rows[-1]["index"]
|
||||||
|
|
||||||
|
heading_row_indices = []
|
||||||
|
for row in rows:
|
||||||
|
if row.get("is_header"):
|
||||||
|
continue
|
||||||
|
ri = row["index"]
|
||||||
|
if ri == first_ri or ri == last_ri:
|
||||||
|
continue
|
||||||
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||||
|
content_cells = [
|
||||||
|
c for c in row_cells
|
||||||
|
if c.get("col_type", "").startswith("column_")
|
||||||
|
and (c.get("col_type") != "column_1"
|
||||||
|
or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
|
||||||
|
]
|
||||||
|
if len(content_cells) != 1:
|
||||||
|
continue
|
||||||
|
cell = content_cells[0]
|
||||||
|
# Not in the last column (continuation/example lines)
|
||||||
|
if cell.get("col_index") == last_col:
|
||||||
|
continue
|
||||||
|
text = (cell.get("text") or "").strip()
|
||||||
|
if not text or text.startswith("["):
|
||||||
|
continue
|
||||||
|
# Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)"
|
||||||
|
if text.startswith("("):
|
||||||
|
continue
|
||||||
|
# Single cell NOT in the first content column is likely a
|
||||||
|
# continuation/overflow line, not a heading. Real headings
|
||||||
|
# ("Theme 1", "Unit 3: ...") appear in the first or second
|
||||||
|
# content column.
|
||||||
|
first_content_col = col_indices[0] if col_indices else 0
|
||||||
|
if cell.get("col_index", 0) > first_content_col + 1:
|
||||||
|
continue
|
||||||
|
# Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz")
|
||||||
|
# but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]")
|
||||||
|
_REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b")
|
||||||
|
if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
|
||||||
|
continue
|
||||||
|
# Guard: dictionary section headings are short (1-4 alpha chars
|
||||||
|
# like "A", "Ab", "Zi", "Sch"). Longer text that starts
|
||||||
|
# lowercase is a regular vocabulary word (e.g. "zentral") that
|
||||||
|
# happens to appear alone in its row.
|
||||||
|
alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text)
|
||||||
|
if len(alpha_only) > 4 and text[0].islower():
|
||||||
|
continue
|
||||||
|
heading_row_indices.append(ri)
|
||||||
|
|
||||||
|
# Guard: if >25% of eligible rows would become headings, the
|
||||||
|
# heuristic is misfiring (e.g. sparse single-column layout where
|
||||||
|
# most rows naturally have only 1 content cell).
|
||||||
|
eligible_rows = len(non_header_rows) - 2 # minus first/last excluded
|
||||||
|
if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
|
||||||
|
logger.debug(
|
||||||
|
"Skipping single-cell heading detection for zone %s: "
|
||||||
|
"%d/%d rows would be headings (>25%%)",
|
||||||
|
z.get("zone_index"), len(heading_row_indices), eligible_rows,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
for hri in heading_row_indices:
|
||||||
|
header_cells = [c for c in cells if c.get("row_index") == hri]
|
||||||
|
if not header_cells:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Collect all word_boxes and text
|
||||||
|
all_wb = []
|
||||||
|
all_text_parts = []
|
||||||
|
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
||||||
|
all_wb.extend(hc.get("word_boxes", []))
|
||||||
|
if hc.get("text", "").strip():
|
||||||
|
all_text_parts.append(hc["text"].strip())
|
||||||
|
|
||||||
|
first_col_idx = min(hc["col_index"] for hc in header_cells)
|
||||||
|
|
||||||
|
# Remove old cells for this row, add spanning heading cell
|
||||||
|
z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
|
||||||
|
|
||||||
|
if all_wb:
|
||||||
|
x_min = min(wb["left"] for wb in all_wb)
|
||||||
|
y_min = min(wb["top"] for wb in all_wb)
|
||||||
|
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
||||||
|
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
||||||
|
else:
|
||||||
|
# Fallback to first cell bbox
|
||||||
|
bp = header_cells[0].get("bbox_px", {})
|
||||||
|
x_min = bp.get("x", 0)
|
||||||
|
y_min = bp.get("y", 0)
|
||||||
|
x_max = x_min + bp.get("w", 0)
|
||||||
|
y_max = y_min + bp.get("h", 0)
|
||||||
|
|
||||||
|
zone_idx = z.get("zone_index", 0)
|
||||||
|
z["cells"].append({
|
||||||
|
"cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
|
||||||
|
"zone_index": zone_idx,
|
||||||
|
"row_index": hri,
|
||||||
|
"col_index": first_col_idx,
|
||||||
|
"col_type": "heading",
|
||||||
|
"text": " ".join(all_text_parts),
|
||||||
|
"confidence": 0.0,
|
||||||
|
"bbox_px": {"x": x_min, "y": y_min,
|
||||||
|
"w": x_max - x_min, "h": y_max - y_min},
|
||||||
|
"bbox_pct": {
|
||||||
|
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
||||||
|
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
||||||
|
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
||||||
|
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
||||||
|
},
|
||||||
|
"word_boxes": all_wb,
|
||||||
|
"ocr_engine": "words_first",
|
||||||
|
"is_bold": False,
|
||||||
|
})
|
||||||
|
|
||||||
|
for row in rows:
|
||||||
|
if row["index"] == hri:
|
||||||
|
row["is_header"] = True
|
||||||
|
heading_count += 1
|
||||||
|
|
||||||
|
return heading_count
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_header_rows(
|
||||||
|
rows: List[Dict],
|
||||||
|
zone_words: List[Dict],
|
||||||
|
zone_y: int,
|
||||||
|
columns: Optional[List[Dict]] = None,
|
||||||
|
skip_first_row_header: bool = False,
|
||||||
|
) -> List[int]:
|
||||||
|
"""Detect header rows: first-row heuristic + spanning header detection.
|
||||||
|
|
||||||
|
A "spanning header" is a row whose words stretch across multiple column
|
||||||
|
boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns).
|
||||||
|
"""
|
||||||
|
if len(rows) < 2:
|
||||||
|
return []
|
||||||
|
|
||||||
|
headers = []
|
||||||
|
|
||||||
|
if not skip_first_row_header:
|
||||||
|
first_row = rows[0]
|
||||||
|
second_row = rows[1]
|
||||||
|
|
||||||
|
# Gap between first and second row > 0.5x average row height
|
||||||
|
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
||||||
|
gap = second_row["y_min"] - first_row["y_max"]
|
||||||
|
if gap > avg_h * 0.5:
|
||||||
|
headers.append(0)
|
||||||
|
|
||||||
|
# Also check if first row words are taller than average (bold/header text)
|
||||||
|
all_heights = [w["height"] for w in zone_words]
|
||||||
|
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
||||||
|
first_row_words = [
|
||||||
|
w for w in zone_words
|
||||||
|
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
||||||
|
]
|
||||||
|
if first_row_words:
|
||||||
|
first_h = max(w["height"] for w in first_row_words)
|
||||||
|
if first_h > median_h * 1.3:
|
||||||
|
if 0 not in headers:
|
||||||
|
headers.append(0)
|
||||||
|
|
||||||
|
# Note: Spanning-header detection (rows spanning all columns) has been
|
||||||
|
# disabled because it produces too many false positives on vocabulary
|
||||||
|
# worksheets where IPA transcriptions or short entries naturally span
|
||||||
|
# multiple columns with few words. The first-row heuristic above is
|
||||||
|
# sufficient for detecting real headers.
|
||||||
|
|
||||||
|
return headers
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_colspan_cells(
|
||||||
|
zone_words: List[Dict],
|
||||||
|
columns: List[Dict],
|
||||||
|
rows: List[Dict],
|
||||||
|
cells: List[Dict],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""Detect and merge cells that span multiple columns (colspan).
|
||||||
|
|
||||||
|
A word-block (PaddleOCR phrase) that extends significantly past a column
|
||||||
|
boundary into the next column indicates a merged cell. This replaces
|
||||||
|
the incorrectly split cells with a single cell spanning multiple columns.
|
||||||
|
|
||||||
|
Works for both full-page scans and box zones.
|
||||||
|
"""
|
||||||
|
if len(columns) < 2 or not zone_words or not rows:
|
||||||
|
return cells
|
||||||
|
|
||||||
|
from cv_words_first import _assign_word_to_row
|
||||||
|
|
||||||
|
# Column boundaries (midpoints between adjacent columns)
|
||||||
|
col_boundaries = []
|
||||||
|
for ci in range(len(columns) - 1):
|
||||||
|
col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2)
|
||||||
|
|
||||||
|
def _cols_covered(w_left: float, w_right: float) -> List[int]:
|
||||||
|
"""Return list of column indices that a word-block covers."""
|
||||||
|
covered = []
|
||||||
|
for col in columns:
|
||||||
|
col_mid = (col["x_min"] + col["x_max"]) / 2
|
||||||
|
# Word covers a column if it extends past the column's midpoint
|
||||||
|
if w_left < col_mid < w_right:
|
||||||
|
covered.append(col["index"])
|
||||||
|
# Also include column if word starts within it
|
||||||
|
elif col["x_min"] <= w_left < col["x_max"]:
|
||||||
|
covered.append(col["index"])
|
||||||
|
return sorted(set(covered))
|
||||||
|
|
||||||
|
# Group original word-blocks by row
|
||||||
|
row_word_blocks: Dict[int, List[Dict]] = {}
|
||||||
|
for w in zone_words:
|
||||||
|
ri = _assign_word_to_row(w, rows)
|
||||||
|
row_word_blocks.setdefault(ri, []).append(w)
|
||||||
|
|
||||||
|
# For each row, check if any word-block spans multiple columns
|
||||||
|
rows_to_merge: Dict[int, List[Dict]] = {} # row_index -> list of spanning word-blocks
|
||||||
|
|
||||||
|
for ri, wblocks in row_word_blocks.items():
|
||||||
|
spanning = []
|
||||||
|
for w in wblocks:
|
||||||
|
w_left = w["left"]
|
||||||
|
w_right = w_left + w["width"]
|
||||||
|
covered = _cols_covered(w_left, w_right)
|
||||||
|
if len(covered) >= 2:
|
||||||
|
spanning.append({"word": w, "cols": covered})
|
||||||
|
if spanning:
|
||||||
|
rows_to_merge[ri] = spanning
|
||||||
|
|
||||||
|
if not rows_to_merge:
|
||||||
|
return cells
|
||||||
|
|
||||||
|
# Merge cells for spanning rows
|
||||||
|
new_cells = []
|
||||||
|
for cell in cells:
|
||||||
|
ri = cell.get("row_index", -1)
|
||||||
|
if ri not in rows_to_merge:
|
||||||
|
new_cells.append(cell)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if this cell's column is part of a spanning block
|
||||||
|
ci = cell.get("col_index", -1)
|
||||||
|
is_part_of_span = False
|
||||||
|
for span in rows_to_merge[ri]:
|
||||||
|
if ci in span["cols"]:
|
||||||
|
is_part_of_span = True
|
||||||
|
# Only emit the merged cell for the FIRST column in the span
|
||||||
|
if ci == span["cols"][0]:
|
||||||
|
# Use the ORIGINAL word-block text (not the split cell texts
|
||||||
|
# which may have broken words like "euros a" + "nd cents")
|
||||||
|
orig_word = span["word"]
|
||||||
|
merged_text = orig_word.get("text", "").strip()
|
||||||
|
all_wb = [orig_word]
|
||||||
|
|
||||||
|
# Compute merged bbox
|
||||||
|
if all_wb:
|
||||||
|
x_min = min(wb["left"] for wb in all_wb)
|
||||||
|
y_min = min(wb["top"] for wb in all_wb)
|
||||||
|
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
||||||
|
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
||||||
|
else:
|
||||||
|
x_min = y_min = x_max = y_max = 0
|
||||||
|
|
||||||
|
new_cells.append({
|
||||||
|
"cell_id": cell["cell_id"],
|
||||||
|
"row_index": ri,
|
||||||
|
"col_index": span["cols"][0],
|
||||||
|
"col_type": "spanning_header",
|
||||||
|
"colspan": len(span["cols"]),
|
||||||
|
"text": merged_text,
|
||||||
|
"confidence": cell.get("confidence", 0),
|
||||||
|
"bbox_px": {"x": x_min, "y": y_min,
|
||||||
|
"w": x_max - x_min, "h": y_max - y_min},
|
||||||
|
"bbox_pct": {
|
||||||
|
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
||||||
|
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
||||||
|
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
||||||
|
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
||||||
|
},
|
||||||
|
"word_boxes": all_wb,
|
||||||
|
"ocr_engine": cell.get("ocr_engine", ""),
|
||||||
|
"is_bold": cell.get("is_bold", False),
|
||||||
|
})
|
||||||
|
logger.info(
|
||||||
|
"colspan detected: row %d, cols %s -> merged %d cells (%r)",
|
||||||
|
ri, span["cols"], len(span["cols"]), merged_text[:50],
|
||||||
|
)
|
||||||
|
break
|
||||||
|
if not is_part_of_span:
|
||||||
|
new_cells.append(cell)
|
||||||
|
|
||||||
|
return new_cells
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor helper functions — barrel re-export module.
|
||||||
|
|
||||||
|
This file re-exports all public symbols from the split sub-modules
|
||||||
|
so that existing ``from grid_editor_helpers import ...`` statements
|
||||||
|
continue to work without changes.
|
||||||
|
|
||||||
|
Sub-modules:
|
||||||
|
- columns — column detection, cross-column splitting, marker merging
|
||||||
|
- filters — word/zone filtering, border ghosts, decorative margins
|
||||||
|
- headers — header/heading detection, colspan detection
|
||||||
|
- zones — vertical dividers, zone splitting/merging, zone grid building
|
||||||
|
|
||||||
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||||
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# --- Re-export: columns ---------------------------------------------------
|
||||||
|
from .columns import ( # noqa: F401
|
||||||
|
_is_recognized_word,
|
||||||
|
_split_cross_column_words,
|
||||||
|
_cluster_columns_by_alignment,
|
||||||
|
_MARKER_CHARS,
|
||||||
|
_merge_inline_marker_columns,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Re-export: filters ----------------------------------------------------
|
||||||
|
from .filters import ( # noqa: F401
|
||||||
|
_filter_border_strip_words,
|
||||||
|
_GRID_GHOST_CHARS,
|
||||||
|
_filter_border_ghosts,
|
||||||
|
_flatten_word_boxes,
|
||||||
|
_words_in_zone,
|
||||||
|
_get_content_bounds,
|
||||||
|
_filter_decorative_margin,
|
||||||
|
_filter_footer_words,
|
||||||
|
_filter_header_junk,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Re-export: headers ----------------------------------------------------
|
||||||
|
from .headers import ( # noqa: F401
|
||||||
|
_detect_heading_rows_by_color,
|
||||||
|
_detect_heading_rows_by_single_cell,
|
||||||
|
_detect_header_rows,
|
||||||
|
_detect_colspan_cells,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Re-export: zones -------------------------------------------------------
|
||||||
|
from .zones import ( # noqa: F401
|
||||||
|
_PIPE_RE_VSPLIT,
|
||||||
|
_detect_vertical_dividers,
|
||||||
|
_split_zone_at_vertical_dividers,
|
||||||
|
_merge_content_zones_across_boxes,
|
||||||
|
_build_zone_grid,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Re-export from cv_words_first (used by cv_box_layout.py) ---------------
|
||||||
|
from cv_words_first import _cluster_rows # noqa: F401
|
||||||
@@ -0,0 +1,389 @@
|
|||||||
|
"""
|
||||||
|
Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
|
||||||
|
|
||||||
|
Split from grid_editor_helpers.py for maintainability.
|
||||||
|
All functions are pure computation — no HTTP, DB, or session side effects.
|
||||||
|
|
||||||
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||||
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from cv_vocab_types import PageZone
|
||||||
|
from cv_words_first import _cluster_rows, _build_cells
|
||||||
|
|
||||||
|
from .columns import (
|
||||||
|
_cluster_columns_by_alignment,
|
||||||
|
_merge_inline_marker_columns,
|
||||||
|
_split_cross_column_words,
|
||||||
|
)
|
||||||
|
from .headers import (
|
||||||
|
_detect_header_rows,
|
||||||
|
_detect_colspan_cells,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Vertical divider detection and zone splitting
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_PIPE_RE_VSPLIT = re.compile(r"^\|+$")
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_vertical_dividers(
|
||||||
|
words: List[Dict],
|
||||||
|
zone_x: int,
|
||||||
|
zone_w: int,
|
||||||
|
zone_y: int,
|
||||||
|
zone_h: int,
|
||||||
|
) -> List[float]:
|
||||||
|
"""Detect vertical divider lines from pipe word_boxes at consistent x.
|
||||||
|
|
||||||
|
Returns list of divider x-positions (empty if no dividers found).
|
||||||
|
"""
|
||||||
|
if not words or zone_w <= 0 or zone_h <= 0:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Collect pipe word_boxes
|
||||||
|
pipes = [
|
||||||
|
w for w in words
|
||||||
|
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
||||||
|
]
|
||||||
|
if len(pipes) < 5:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Cluster pipe x-centers by proximity
|
||||||
|
tolerance = max(15, int(zone_w * 0.02))
|
||||||
|
pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes)
|
||||||
|
|
||||||
|
clusters: List[List[float]] = [[pipe_xs[0]]]
|
||||||
|
for x in pipe_xs[1:]:
|
||||||
|
if x - clusters[-1][-1] <= tolerance:
|
||||||
|
clusters[-1].append(x)
|
||||||
|
else:
|
||||||
|
clusters.append([x])
|
||||||
|
|
||||||
|
dividers: List[float] = []
|
||||||
|
for cluster in clusters:
|
||||||
|
if len(cluster) < 5:
|
||||||
|
continue
|
||||||
|
mean_x = sum(cluster) / len(cluster)
|
||||||
|
# Must be between 15% and 85% of zone width
|
||||||
|
rel_pos = (mean_x - zone_x) / zone_w
|
||||||
|
if rel_pos < 0.15 or rel_pos > 0.85:
|
||||||
|
continue
|
||||||
|
# Check vertical coverage: pipes must span >= 50% of zone height
|
||||||
|
cluster_pipes = [
|
||||||
|
w for w in pipes
|
||||||
|
if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance
|
||||||
|
]
|
||||||
|
ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes]
|
||||||
|
y_span = max(ys) - min(ys) if ys else 0
|
||||||
|
if y_span < zone_h * 0.5:
|
||||||
|
continue
|
||||||
|
dividers.append(mean_x)
|
||||||
|
|
||||||
|
return sorted(dividers)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
) -> List["PageZone"]:
    """Cut a PageZone into side-by-side sub-zones at the given x-positions.

    The zone's left edge, the dividers and the zone's right edge form the
    boundaries; each adjacent pair yields one sub-zone (boundaries are
    truncated to int).  Sub-zones copy ``zone_type``, ``y``, ``height``,
    ``box`` and ``image_overlays`` from *zone* and share ``vsplit_group_id``.

    ``layout_hint`` is ``"left_of_vsplit"`` for the first sub-zone,
    ``"right_of_vsplit"`` for the last and ``"middle_of_vsplit"`` otherwise.
    All sub-zones get ``index=0``; the caller is expected to re-index them.

    Args:
        zone: The zone to split.
        divider_xs: Divider x-positions, used in the order given.
        vsplit_group_id: Group id stored on every sub-zone.

    Returns:
        The sub-zones from left to right.
    """
    edges = [zone.x] + divider_xs + [zone.x + zone.width]
    last_idx = len(edges) - 2

    pieces = []
    for idx, (left_edge, right_edge) in enumerate(zip(edges, edges[1:])):
        if idx == 0:
            hint = "left_of_vsplit"
        elif idx == last_idx:
            hint = "right_of_vsplit"
        else:
            hint = "middle_of_vsplit"

        x_start = int(left_edge)
        x_end = int(right_edge)
        pieces.append(PageZone(
            index=0,  # re-indexed later
            zone_type=zone.zone_type,
            y=zone.y,
            height=zone.height,
            x=x_start,
            width=x_end - x_start,
            box=zone.box,
            image_overlays=zone.image_overlays,
            layout_hint=hint,
            vsplit_group=vsplit_group_id,
        ))

    return pieces
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_content_zones_across_boxes(
    zones: List,
    content_x: int,
    content_w: int,
) -> List:
    """Fuse content zones that are only separated by box zones.

    A run ``content, box, content[, box, content ...]`` collapses into one
    content zone spanning the vertical extent of all its content members;
    the absorbed boxes are recorded in ``image_overlays`` of the merged zone.
    A box is only absorbed when a content zone directly follows it, so boxes
    at the start/end of the page or next to another box stay standalone.

    Args:
        zones: Zones in page order. Lists shorter than three entries are
            returned unchanged (and not re-indexed).
        content_x: X origin given to every merged zone.
        content_w: Width given to every merged zone.

    Returns:
        The new zone list. Every zone in it has ``index`` rewritten to its
        position in the list.
    """
    if len(zones) < 3:
        return zones

    def _as_overlay(box_zone) -> dict:
        # Geometry of the absorbed box zone, plus the detected box details
        # when the zone carries a box object.
        entry = {
            "y": box_zone.y,
            "height": box_zone.height,
            "x": box_zone.x,
            "width": box_zone.width,
        }
        if box_zone.box:
            entry["box"] = {
                "x": box_zone.box.x,
                "y": box_zone.box.y,
                "width": box_zone.box.width,
                "height": box_zone.box.height,
                "confidence": box_zone.box.confidence,
                "border_thickness": box_zone.box.border_thickness,
            }
        return entry

    total = len(zones)
    merged_zones: List = []
    cursor = 0
    while cursor < total:
        head = zones[cursor]
        if head.zone_type != "content":
            merged_zones.append(head)
            cursor += 1
            continue

        # Greedily absorb [box, content] pairs that follow this content zone.
        run_contents = [head]
        run_boxes = []
        nxt = cursor + 1
        while (
            nxt + 1 < total
            and zones[nxt].zone_type == "box"
            and zones[nxt + 1].zone_type == "content"
        ):
            run_boxes.append(zones[nxt])
            run_contents.append(zones[nxt + 1])
            nxt += 2

        if not run_boxes:
            # Nothing to merge with -- emit the content zone untouched.
            merged_zones.append(head)
            cursor += 1
            continue

        top = min(c.y for c in run_contents)
        bottom = max(c.y + c.height for c in run_contents)
        merged_zones.append(
            PageZone(
                index=0,  # re-indexed below
                zone_type="content",
                y=top,
                height=bottom - top,
                x=content_x,
                width=content_w,
                image_overlays=[_as_overlay(bz) for bz in run_boxes],
            )
        )
        cursor = nxt

    for position, zone in enumerate(merged_zones):
        zone.index = position

    logger.info(
        "zone-merge: %d zones -> %d zones after merging across boxes",
        len(zones), len(merged_zones),
    )
    return merged_zones
|
||||||
|
|
||||||
|
|
||||||
|
def _build_zone_grid(
    zone_words: List[Dict],
    zone_x: int,
    zone_y: int,
    zone_w: int,
    zone_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
) -> Dict[str, Any]:
    """Build columns, rows, cells for a single zone from its words.

    Args:
        zone_words: Word dicts belonging to this zone (``left``, ``top``,
            ``width``, ``height``, optional ``text`` are read here).
        zone_x: Not used by this function; kept for interface compatibility.
        zone_y: Passed on to header-row detection.
        zone_w: Passed on to per-zone column detection.
        zone_h: Not used by this function; kept for interface compatibility.
        zone_index: Index of the zone; every cell id is prefixed with
            ``Z{zone_index}_`` and every cell gets a ``zone_index`` key.
        img_w: Image width in px, used for percentage values (0 -> pct 0).
        img_h: Image height in px, used for percentage values (0 -> pct 0).
        global_columns: If provided, use these pre-computed column boundaries
            instead of detecting columns per zone. Used for content zones so
            that all content zones (above/between/below boxes) share the same
            column structure. Box zones always detect columns independently.
        skip_first_row_header: Forwarded unchanged to header-row detection.

    Returns:
        Dict with ``columns``, ``rows``, ``cells`` and ``header_rows``. When a
        grid was actually built it additionally carries ``_raw_columns`` (the
        internal column dicts, for propagation to other zones). If there are
        no words, or no columns/rows could be derived, all four lists are
        empty and ``_raw_columns`` is absent.
    """
    def _pct(value, total):
        # Share of the image dimension in percent; 0 when the dimension is 0.
        return round(value / total * 100, 2) if total else 0

    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }

    # Cluster rows first (needed for column alignment analysis)
    rows = _cluster_rows(zone_words)

    # Diagnostic logging for small/medium zones (box zones typically have 40-60 words)
    if len(zone_words) <= 60:
        import statistics as _st
        _heights = [w['height'] for w in zone_words if w.get('height', 0) > 0]
        _med_h = _st.median(_heights) if _heights else 20
        # Only computed for the log line below; it is not passed to the
        # clustering call above.
        _y_tol = max(_med_h * 0.5, 5)
        logger.info(
            "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
            zone_index, len(zone_words), _med_h, _y_tol, len(rows),
        )
        for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])):
            logger.info(
                " zone %d word: y=%d x=%d h=%d w=%d '%s'",
                zone_index, w['top'], w['left'], w['height'], w['width'],
                w.get('text', '')[:40],
            )
        for r in rows:
            logger.info(
                " zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
                zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'],
            )

    # Use global columns if provided, otherwise detect per zone
    columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)

    # Merge inline marker columns (bullets, numbering) into adjacent text
    if not global_columns:
        columns = _merge_inline_marker_columns(columns, zone_words)

    if not columns or not rows:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }

    # Split word boxes that straddle column boundaries (e.g. "sichzie"
    # spanning Col 1 + Col 2). Must happen after column detection and
    # before cell assignment.
    # Keep original words for colspan detection (split destroys span info).
    original_zone_words = zone_words
    if len(columns) >= 2:
        zone_words = _split_cross_column_words(zone_words, columns)

    # Build cells
    cells = _build_cells(zone_words, columns, rows, img_w, img_h)

    # --- Detect colspan (merged cells spanning multiple columns) ---
    # Uses the ORIGINAL (pre-split) words to detect word-blocks that span
    # multiple columns. _split_cross_column_words would have destroyed
    # this information by cutting words at column boundaries.
    if len(columns) >= 2:
        cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)

    # Prefix cell IDs with zone index
    for cell in cells:
        cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
        cell["zone_index"] = zone_index

    # Detect header rows (pass columns for spanning header detection)
    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
                                      skip_first_row_header=skip_first_row_header)

    # Merge cells in spanning header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
        for hri in header_rows:
            header_cells = [c for c in cells if c["row_index"] == hri]
            if len(header_cells) <= 1:
                continue

            # Collect all word_boxes and text from all columns, left to right
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes") or [])
                header_text = (hc.get("text") or "").strip()
                if header_text:
                    all_text_parts.append(header_text)

            if not all_wb:
                # No geometry to build the spanning cell from: keep the
                # individual cells rather than dropping their text.
                continue

            # Remove all header cells, replace with one spanning cell
            cells = [c for c in cells if c["row_index"] != hri]
            x_min = min(wb["left"] for wb in all_wb)
            y_min = min(wb["top"] for wb in all_wb)
            x_max = max(wb["left"] + wb["width"] for wb in all_wb)
            y_max = max(wb["top"] + wb["height"] for wb in all_wb)
            cells.append({
                # This cell is created after the prefix pass above, so it
                # has to carry the zone prefix and zone_index itself to stay
                # consistent with (and unique among) all other cells.
                "cell_id": f"Z{zone_index}_R{hri:02d}_C0",
                "row_index": hri,
                "col_index": 0,
                "col_type": "spanning_header",
                "text": " ".join(all_text_parts),
                "confidence": 0.0,
                "bbox_px": {"x": x_min, "y": y_min,
                            "w": x_max - x_min, "h": y_max - y_min},
                "bbox_pct": {
                    "x": _pct(x_min, img_w),
                    "y": _pct(y_min, img_h),
                    "w": _pct(x_max - x_min, img_w),
                    "h": _pct(y_max - y_min, img_h),
                },
                "word_boxes": all_wb,
                "ocr_engine": "words_first",
                "is_bold": True,
                "zone_index": zone_index,
            })

    # Convert columns to output format with percentages
    out_columns = [
        {
            "index": col["index"],
            "label": col["type"],
            "x_min_px": round(col["x_min"]),
            "x_max_px": round(col["x_max"]),
            "x_min_pct": _pct(col["x_min"], img_w),
            "x_max_pct": _pct(col["x_max"], img_w),
            "bold": False,
        }
        for col in columns
    ]

    # Convert rows to output format with percentages
    out_rows = [
        {
            "index": row["index"],
            "y_min_px": round(row["y_min"]),
            "y_max_px": round(row["y_max"]),
            "y_min_pct": _pct(row["y_min"], img_h),
            "y_max_pct": _pct(row["y_max"], img_h),
            "is_header": row["index"] in header_rows,
        }
        for row in rows
    ]

    return {
        "columns": out_columns,
        "rows": out_rows,
        "cells": cells,
        "header_rows": header_rows,
        "_raw_columns": columns,  # internal: for propagation to other zones
    }
|
||||||
@@ -1,305 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/build/cell_ops.py
|
||||||
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
|
import importlib as _importlib
|
||||||
garbled cell cleanup, word-box reordering, and max_columns enforcement.
|
import sys as _sys
|
||||||
|
_sys.modules[__name__] = _importlib.import_module("grid.build.cell_ops")
|
||||||
Extracted from grid_build_core.py for maintainability.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from typing import Any, Dict, List, Tuple
|
|
||||||
|
|
||||||
from cv_ocr_engines import (
|
|
||||||
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Drop bullet/artifact word_boxes and glue split words (Step 5i).

    Works cell by cell on cells with at least two word_boxes and mutates
    ``zones_data`` in place:

    * (a)  small (area < 200), non-black, conf < 85 boxes are removed;
    * (a2) boxes whose whole text is one of a fixed set of stray symbols
      are removed;
    * (b)/(c) horizontally neighbouring boxes (sorted by ``left``) that
      overlap are either merged into one word (alphabetic fragments) or
      de-duplicated by dictionary lookup, confidence and height; adjacent
      blue boxes with identical text are de-duplicated as well.

    After removals the cell's ``word_boxes`` are replaced and, unless the cell
    is flagged ``_ipa_corrected``, its ``text`` is rebuilt. If the net number
    of removed boxes is non-zero, cells left with neither word_boxes nor text
    are dropped from every zone.
    """
    alpha_word = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    stray_symbols = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}

    def _txt(box):
        return (box.get("text") or "").strip()

    removed_total = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            boxes = cell.get("word_boxes") or []
            if len(boxes) < 2:
                continue
            drop: set = set()

            for idx, box in enumerate(boxes):
                # Rule (a): tiny coloured symbols
                if (box.get("color_name", "black") != "black"
                        and box.get("width", 0) * box.get("height", 0) < 200
                        and box.get("conf", 100) < 85):
                    drop.add(idx)
                # Rule (a2): isolated non-alphanumeric symbols
                label = _txt(box)
                if (label
                        and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', label)
                        and len(label) <= 2
                        and label in stray_symbols):
                    drop.add(idx)

            # Rule (b) + (c): overlap and duplicate detection on x-neighbours
            merges: List[Tuple[int, int]] = []
            by_left = sorted(enumerate(boxes), key=lambda pair: pair[1].get("left", 0))
            for (ia, a), (ib, b) in zip(by_left, by_left[1:]):
                a_start = a.get("left", 0)
                a_end = a_start + a.get("width", 0)
                b_start = b.get("left", 0)
                b_end = b_start + b.get("width", 0)
                shared = max(0, min(a_end, b_end) - max(a_start, b_start))
                narrower = min(a.get("width", 1), b.get("width", 1))
                ratio = shared / narrower if narrower > 0 else 0

                if not ratio > 0.20:
                    # Barely/not overlapping: only exact blue duplicates that
                    # sit closer than 6px are de-duplicated (lower conf goes,
                    # the left one on a tie).
                    if (b_start - a_end < 6
                            and a.get("color_name") == "blue"
                            and b.get("color_name") == "blue"
                            and _txt(a) == _txt(b)):
                        drop.add(ia if a.get("conf", 50) <= b.get("conf", 50) else ib)
                    continue

                ta = _txt(a)
                tb = _txt(b)

                if alpha_word.match(ta) and alpha_word.match(tb):
                    # Syllable-split words
                    if ratio <= 0.75:
                        merges.append((ia, ib))
                        continue
                    # High overlap with short prefix
                    stem_a = ta.rstrip(".,;:!?")
                    stem_b = tb.rstrip(".,;:!?")
                    if (stem_a.lower() != stem_b.lower()
                            and min(len(stem_a), len(stem_b)) <= 4):
                        merges.append((ia, ib))
                        continue

                if ratio <= 0.40:
                    continue

                conf_a = a.get("conf", 50)
                conf_b = b.get("conf", 50)

                # Very high overlap: prefer IPA-dictionary word
                if ratio > 0.90 and ta.lower() != tb.lower():
                    known_a = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', ta.lower()), "british")) if ta.isalpha() else False
                    known_b = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', tb.lower()), "british")) if tb.isalpha() else False
                    if known_a != known_b:
                        drop.add(ib if known_a else ia)
                        continue

                # Otherwise the lower-confidence box goes; on a tie the
                # taller one goes (the right one if heights tie as well).
                if conf_a < conf_b:
                    drop.add(ia)
                elif conf_b < conf_a:
                    drop.add(ib)
                elif a.get("height", 0) > b.get("height", 0):
                    drop.add(ia)
                else:
                    drop.add(ib)

            # Execute merges first (syllable-split words)
            absorbed_into: Dict[int, int] = {}
            for first, second in merges:
                # Follow the chain so A+B followed by B+C ends up in A.
                keeper = first
                while keeper in absorbed_into:
                    keeper = absorbed_into[keeper]
                if keeper in drop or second in drop or second in absorbed_into:
                    continue
                target = boxes[keeper]
                piece = boxes[second]
                joined = (target.get("text") or "").rstrip(".,;:!?") + (piece.get("text") or "").strip()
                left = min(target["left"], piece["left"])
                top = min(target["top"], piece["top"])
                right = max(target["left"] + target["width"],
                            piece["left"] + piece["width"])
                bottom = max(target["top"] + target["height"],
                             piece["top"] + piece["height"])
                target["text"] = joined
                target["left"] = left
                target["top"] = top
                target["width"] = right - left
                target["height"] = bottom - top
                target["conf"] = (target.get("conf", 50) + piece.get("conf", 50)) / 2
                drop.add(second)
                absorbed_into[second] = keeper
                # A merged fragment is not a removed artifact: cancel out the
                # increment it will receive below via len(drop).
                removed_total -= 1

            if drop:
                removed_total += len(drop)
                survivors = [box for idx, box in enumerate(boxes) if idx not in drop]
                cell["word_boxes"] = survivors
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(survivors)

    if removed_total:
        for zone in zones_data:
            zone["cells"] = [c for c in zone.get("cells", [])
                             if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", removed_total)
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre).

    A cell is treated as an artifact when its stripped text, with trailing
    punctuation removed (the "core"), is

    * empty,
    * made only of non-word characters / digits / whitespace,
    * at most 2 chars, not purely alphabetic and not a common short word,
    * at most 3 chars, all upper-case and not a common short word, or
    * at most 5 chars mixing digits and ASCII letters, unless it looks like a
      page reference (``p12``, ``S.4`` ...).

    A small allow-list of connector symbols (``=``, ``(=``, ``=)``, ``;``,
    ``:``, dashes) is never removed. The list is checked against the full
    stripped text first: ``;`` and ``:`` are themselves trailing punctuation,
    so checking only the core would always see an empty string and delete
    exactly the cells the list is meant to protect.

    ``zones_data`` is modified in place. When at least one cell was removed,
    every zone's ``rows`` are pruned to the rows that still own a cell.
    """
    common_short_words = {
        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
        "die", "der", "das", "dem", "den", "des", "ein", "und",
        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
        "on", "or", "so", "to", "up", "us", "we",
        "the", "and", "but", "for", "not",
    }
    keep_symbols = ('=', '(=', '=)', ';', ':', '-', '–', '—')
    # Patterns are compiled once per call instead of once per cell.
    pure_junk_re = re.compile(r'^[\W\d\s]+$')
    digit_re = re.compile(r'\d')
    letter_re = re.compile(r'[A-Za-z]')
    page_ref_re = re.compile(r'^[pPsS]\.?\d+$')

    def _is_artifact(text: str) -> bool:
        if text in keep_symbols:
            return False
        core = text.rstrip(".,;:!?'\"")
        if not core:
            return True
        if pure_junk_re.match(core):
            return core.strip() not in keep_symbols
        lowered = core.lower()
        if len(core) <= 2 and lowered not in common_short_words and not core.isalpha():
            return True
        if len(core) <= 3 and core.isupper() and lowered not in common_short_words:
            return True
        return bool(
            len(core) <= 5
            and digit_re.search(core)
            and letter_re.search(core)
            and not page_ref_re.match(core)
        )

    artifact_cells_removed = 0
    for z in zones_data:
        cells = z.get("cells", [])
        kept = [c for c in cells if not _is_artifact((c.get("text") or "").strip())]
        artifact_cells_removed += len(cells) - len(kept)
        z["cells"] = kept

    if artifact_cells_removed:
        for z in zones_data:
            live_rows = {c.get("row_index") for c in z.get("cells", [])}
            z["rows"] = [r for r in z.get("rows", []) if r["index"] in live_rows]
        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
    """Put each cell's word_boxes into line-grouped order (Step 5j).

    Cells with fewer than two word_boxes are skipped. For the others the
    boxes are grouped into lines (15px vertical tolerance) and flattened
    line by line; the cell's ``word_boxes`` list is replaced only when the
    resulting sequence of box objects differs from the current one.
    ``zones_data`` is modified in place.
    """
    reordered_cells = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            current = cell.get("word_boxes") or []
            if len(current) < 2:
                continue

            flattened = []
            for line in _group_words_into_lines(current, y_tolerance_px=15):
                flattened.extend(line)

            # Compare by object identity, position by position.
            same_sequence = len(flattened) == len(current) and all(
                new is old for new, old in zip(flattened, current)
            )
            if not same_sequence:
                cell["word_boxes"] = flattened
                reordered_cells += 1

    if reordered_cells:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", reordered_cells)
|
|
||||||
|
|
||||||
|
|
||||||
def _enforce_max_columns(
    zones_data: List[Dict[str, Any]],
    max_columns: int,
) -> None:
    """Enforce max_columns by merging narrowest columns (Step 5k).

    Only zones with ``zone_type == "content"`` are touched. While a zone has
    more than ``max_columns`` columns, the currently narrowest column (by
    ``x_max_px - x_min_px``) is merged into its right neighbour, or into its
    left neighbour when it is the right-most column. Cells of the merged
    column move to the target column; if the target already has a cell in
    the same row, text and word_boxes are joined in left-to-right order and
    the moved cell is dropped. Finally columns are re-indexed 0..N-1 by x
    position and cell ``col_index`` values are remapped accordingly.

    Differences from the previous implementation:

    * joined text/word_boxes follow reading order (the left column's content
      comes first) instead of always putting the target's content first;
    * the narrowest column is re-evaluated after each merge, because a merge
      widens its target and a width ordering computed up front goes stale;
    * re-indexing uses a precomputed old->new map so a cell can never be
      remapped twice.

    Args:
        zones_data: Zone dicts, modified in place (``columns`` list, ``cells``
            list and the dicts inside them).
        max_columns: Maximum number of columns a content zone may keep.
    """
    def _left_edge(col):
        return col.get("x_min_px", col.get("x_min_pct", 0))

    def _right_edge(col):
        return col.get("x_max_px", col.get("x_max_pct", 100))

    def _width_px(col):
        return col.get("x_max_px", 0) - col.get("x_min_px", 0)

    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cols = z.get("columns", [])
        cells = z.get("cells", [])
        if len(cols) <= max_columns:
            continue

        logger.info(
            "max_columns=%d: zone %s has %d columns -> merging",
            max_columns, z.get("zone_index"), len(cols),
        )

        while len(cols) > max_columns:
            # min() returns the first of equally narrow columns, matching
            # what a stable sort by width would put first.
            narrowest = min(cols, key=_width_px)
            ni = narrowest["index"]

            by_x = sorted(cols, key=_left_edge)
            pos = next(i for i, c in enumerate(by_x) if c is narrowest)
            if pos + 1 < len(by_x):
                merge_target = by_x[pos + 1]
                narrow_is_left = True
            elif pos > 0:
                merge_target = by_x[pos - 1]
                narrow_is_left = False
            else:
                break  # a single column has nothing to merge into
            ti = merge_target["index"]

            merge_target["x_min_px"] = min(_left_edge(merge_target), _left_edge(narrowest))
            merge_target["x_max_px"] = max(_right_edge(merge_target), _right_edge(narrowest))
            if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
                merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
                merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])

            for cell in cells:
                if cell.get("col_index") != ni:
                    continue
                cell["col_index"] = ti
                existing = next(
                    (c for c in cells
                     if c is not cell
                     and not c.get("_merged")
                     and c.get("col_index") == ti
                     and c.get("row_index") == cell.get("row_index")),
                    None,
                )
                if existing is None:
                    continue
                # Keep left-to-right reading order in the joined cell.
                first, second = (cell, existing) if narrow_is_left else (existing, cell)
                existing["text"] = (
                    ((first.get("text") or "") + " " + (second.get("text") or "")).strip()
                )
                existing["word_boxes"] = (
                    (first.get("word_boxes") or []) + (second.get("word_boxes") or [])
                )
                cell["_merged"] = True

            z["cells"] = [c for c in cells if not c.get("_merged")]
            cells = z["cells"]
            # Remove by identity and in place, so z["columns"] stays the same
            # list object and an equal-looking column is never removed.
            cols[:] = [c for c in cols if c is not narrowest]

        # Re-index columns 0..N-1
        remap = {}
        for new_idx, col in enumerate(sorted(cols, key=_left_edge)):
            remap[col["index"]] = new_idx
            col["index"] = new_idx
        for cell in cells:
            old_idx = cell.get("col_index")
            if old_idx in remap:
                cell["col_index"] = remap[old_idx]

        logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
|
|
||||||
|
|||||||
@@ -1,390 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/build/cleanup.py
|
||||||
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
|
import importlib as _importlib
|
||||||
divider removal, connector normalization, border strip detection, and
|
import sys as _sys
|
||||||
alphabet sidebar removal.
|
_sys.modules[__name__] = _importlib.import_module("grid.build.cleanup")
|
||||||
|
|
||||||
Extracted from grid_build_core.py for maintainability.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from typing import Any, Dict, List
|
|
||||||
|
|
||||||
from cv_ocr_engines import _words_to_reading_order_text
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_PIPE_RE = re.compile(r"^\|+$")
|
|
||||||
|
|
||||||
|
|
||||||
def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
) -> bool:
    """Run the zone clean-up passes in their fixed order.

    Order: junk rows, artifact cells, oversized word boxes, pipe dividers,
    connector columns, border strips, alphabet sidebars.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered;
            handed to the border-strip pass.
        session_id: Accepted for logging purposes; not read in this function.

    Returns:
        The border_prefiltered flag as returned by the border-strip pass.
    """
    passes_before_border = (
        _remove_junk_rows,
        _remove_artifact_cells,
        _remove_oversized_word_boxes,
        _remove_pipe_dividers,
        _normalize_connector_columns,
    )
    for cleanup_pass in passes_before_border:
        cleanup_pass(zones_data)

    updated_flag = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return updated_flag
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
    """Remove rows where ALL cells contain only short, low-confidence text.

    Also removes 'oversized stub' rows and 'scattered debris' rows. A row is
    judged on the word_boxes of its cells:

    * Rule 1: every word box has conf < 50 and at most 3 characters;
    * Rule 2: at most 3 word boxes, at most 5 characters in total, tallest
      box more than 1.8x the zone's median word height, and no page
      reference such as ``p.12`` / ``S 4`` among them;
    * Rule 3: no word box longer than 2 characters.

    Rows without any word box are left alone: there is nothing to judge, and
    Rule 3 previously raised ``ValueError`` (``max()`` of an empty sequence)
    for them.

    ``zones_data`` is modified in place: junk rows disappear from the zone's
    ``rows`` and their cells from ``cells``.
    """
    junk_conf_threshold = 50
    junk_max_text_len = 3
    page_ref_re = re.compile(r'^[pPsS]\.?\s*\d+$')

    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not cells or not rows:
            continue

        # Median word height across the zone for oversized detection
        # (upper median; 28 when no positive height is available).
        all_wb_heights = sorted(
            wb["height"]
            for cell in cells
            for wb in cell.get("word_boxes") or []
            if wb.get("height", 0) > 0
        )
        median_wb_h = all_wb_heights[len(all_wb_heights) // 2] if all_wb_heights else 28

        # Group cells by row once instead of rescanning all cells per row.
        cells_by_row: Dict[Any, List[Dict[str, Any]]] = {}
        for cell in cells:
            cells_by_row.setdefault(cell.get("row_index"), []).append(cell)

        junk_row_indices = set()
        for row in rows:
            ri = row["index"]
            row_wbs = [
                wb for cell in cells_by_row.get(ri, ())
                for wb in cell.get("word_boxes") or []
            ]
            if not row_wbs:
                continue
            texts = [(wb.get("text") or "").strip() for wb in row_wbs]

            # Rule 1: ALL word_boxes are low-conf AND short text
            if all(
                wb.get("conf", 0) < junk_conf_threshold and len(text) <= junk_max_text_len
                for wb, text in zip(row_wbs, texts)
            ):
                junk_row_indices.add(ri)
                continue

            # Rule 2: oversized stub -- <=3 words, short total text,
            # and word height > 1.8x median
            if len(row_wbs) <= 3:
                total_text = "".join(texts)
                max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
                has_page_ref = any(page_ref_re.match(text) for text in texts)
                if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
                    junk_row_indices.add(ri)
                    continue

            # Rule 3: scattered debris -- rows with only tiny fragments
            if max(len(text) for text in texts) <= 2:
                junk_row_indices.add(ri)

        if junk_row_indices:
            z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
            z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
            logger.info(
                "build-grid: removed %d junk rows from zone %d: %s",
                len(junk_row_indices), z.get("zone_index", 0),
                sorted(junk_row_indices),
            )
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Drop cells that consist of one very short, low-confidence word box.

    A cell qualifies when it has exactly one word box whose stripped text is
    at most 2 characters long and whose ``conf`` (default 100) is below 65.
    Matching is done through ``cell_id``: all cells of the zone sharing the
    id of a qualifying cell are removed. ``zones_data`` is modified in place.
    """
    max_len = 2
    conf_limit = 65

    for zone in zones_data:
        zone_cells = zone.get("cells", [])
        if not zone_cells:
            continue

        doomed_ids = set()
        for candidate in zone_cells:
            boxes = candidate.get("word_boxes") or []
            if len(boxes) != 1:
                continue
            only_box = boxes[0]
            stripped = (only_box.get("text") or "").strip()
            if len(stripped) <= max_len and only_box.get("conf", 100) < conf_limit:
                doomed_ids.add(candidate.get("cell_id"))

        if not doomed_ids:
            continue

        zone["cells"] = [c for c in zone_cells if c.get("cell_id") not in doomed_ids]
        logger.info(
            "build-grid: removed %d artifact cells from zone %d: %s",
            len(doomed_ids), zone.get("zone_index", 0),
            [c.get("text") for c in zone_cells if c.get("cell_id") in doomed_ids],
        )
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
    """Strip word boxes taller than three times the zone's median height.

    The median is the upper median of all positive word-box heights in the
    zone; zones without any positive height are skipped. Cells that lose a
    box get their ``word_boxes`` replaced and their ``text`` rebuilt from the
    remaining boxes. If anything was removed in a zone, every cell of that
    zone that has no word boxes afterwards is dropped. ``zones_data`` is
    modified in place.
    """
    for zone in zones_data:
        zone_cells = zone.get("cells", [])
        if not zone_cells:
            continue

        heights = sorted(
            box["height"]
            for cell in zone_cells
            for box in cell.get("word_boxes") or []
            if box.get("height", 0) > 0
        )
        if not heights:
            continue
        height_limit = heights[len(heights) // 2] * 3

        dropped = 0
        for cell in zone_cells:
            boxes = cell.get("word_boxes") or []
            fitting = [box for box in boxes if box.get("height", 0) <= height_limit]
            excess = len(boxes) - len(fitting)
            if excess > 0:
                dropped += excess
                cell["word_boxes"] = fitting
                cell["text"] = _words_to_reading_order_text(fitting)

        if dropped:
            zone["cells"] = [c for c in zone_cells if c.get("word_boxes")]
            logger.info(
                "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
                dropped, height_limit, zone.get("zone_index", 0),
            )
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
    """Remove pipe-only word boxes and pipes hanging on the edges of cell text.

    Pass 1 (skipped for zones that carry a ``vsplit_group``; per the original
    note their pipes were already removed before the split): word boxes whose
    stripped text consists solely of ``|`` characters are dropped, the cell
    text is rebuilt from the remaining boxes, and cells left with neither
    word boxes nor text are removed from the zone.

    Pass 2 (all zones): ``|`` characters at the very start/end of a cell's
    text are stripped. Pipes inside a word are preserved, as they can be
    syllable separators. ``zones_data`` is modified in place.
    """
    for zone in zones_data:
        if zone.get("vsplit_group") is not None:
            continue  # pipes already removed before split

        pipes_dropped = 0
        for cell in zone.get("cells", []):
            boxes = cell.get("word_boxes") or []
            non_pipe = [
                box for box in boxes
                if not _PIPE_RE.match((box.get("text") or "").strip())
            ]
            lost = len(boxes) - len(non_pipe)
            if lost > 0:
                pipes_dropped += lost
                cell["word_boxes"] = non_pipe
                cell["text"] = _words_to_reading_order_text(non_pipe)

        if pipes_dropped:
            zone["cells"] = [
                c for c in zone.get("cells", [])
                if (c.get("word_boxes") or c.get("text", "").strip())
            ]
            logger.info(
                "build-grid: removed %d pipe-divider word_boxes from zone %d",
                pipes_dropped, zone.get("zone_index", 0),
            )

    # Strip pipe chars ONLY from cell edges (OCR artifacts).
    # Preserve pipes embedded in words as syllable separators.
    for zone in zones_data:
        for cell in zone.get("cells", []):
            raw = cell.get("text", "")
            if "|" not in raw:
                continue
            trimmed = raw.strip("|").strip()
            if trimmed != raw.strip():
                cell["text"] = trimmed
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
|
|
||||||
"""Normalize narrow connector columns where OCR appends noise chars.
|
|
||||||
|
|
||||||
In synonym dictionaries a narrow column repeats the same word
|
|
||||||
(e.g. "oder") in every row. OCR sometimes appends noise chars.
|
|
||||||
"""
|
|
||||||
for z in zones_data:
|
|
||||||
cols = z.get("columns", [])
|
|
||||||
cells = z.get("cells", [])
|
|
||||||
if not cols or not cells:
|
|
||||||
continue
|
|
||||||
for col in cols:
|
|
||||||
ci = col.get("index")
|
|
||||||
col_cells = [c for c in cells if c.get("col_index") == ci]
|
|
||||||
if len(col_cells) < 3:
|
|
||||||
continue
|
|
||||||
text_counts: Dict[str, int] = {}
|
|
||||||
for c in col_cells:
|
|
||||||
t = (c.get("text") or "").strip()
|
|
||||||
if t:
|
|
||||||
text_counts[t] = text_counts.get(t, 0) + 1
|
|
||||||
if not text_counts:
|
|
||||||
continue
|
|
||||||
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
|
|
||||||
dominant_count = text_counts[dominant_text]
|
|
||||||
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
|
|
||||||
continue
|
|
||||||
fixed = 0
|
|
||||||
for c in col_cells:
|
|
||||||
t = (c.get("text") or "").strip()
|
|
||||||
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
|
|
||||||
c["text"] = dominant_text
|
|
||||||
wbs = c.get("word_boxes") or []
|
|
||||||
if len(wbs) == 1:
|
|
||||||
wbs[0]["text"] = dominant_text
|
|
||||||
fixed += 1
|
|
||||||
if fixed:
|
|
||||||
logger.info(
|
|
||||||
"build-grid: normalized %d outlier cells in connector column %d "
|
|
||||||
"(dominant='%s') zone %d",
|
|
||||||
fixed, ci, dominant_text, z.get("zone_index", 0),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_border_strips(
|
|
||||||
zones_data: List[Dict[str, Any]],
|
|
||||||
border_prefiltered: bool,
|
|
||||||
) -> bool:
|
|
||||||
"""Detect and remove page-border decoration strips.
|
|
||||||
|
|
||||||
Returns updated border_prefiltered flag.
|
|
||||||
"""
|
|
||||||
border_strip_removed = 0
|
|
||||||
if border_prefiltered:
|
|
||||||
logger.info("Step 4e: skipped (border pre-filter already applied)")
|
|
||||||
return border_prefiltered
|
|
||||||
|
|
||||||
for z in zones_data:
|
|
||||||
cells = z.get("cells", [])
|
|
||||||
if not cells:
|
|
||||||
continue
|
|
||||||
all_wbs_with_cell: list = []
|
|
||||||
for cell in cells:
|
|
||||||
for wb in cell.get("word_boxes") or []:
|
|
||||||
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
|
|
||||||
if len(all_wbs_with_cell) < 10:
|
|
||||||
continue
|
|
||||||
all_wbs_with_cell.sort(key=lambda t: t[0])
|
|
||||||
total = len(all_wbs_with_cell)
|
|
||||||
|
|
||||||
# -- Left-edge scan --
|
|
||||||
left_strip_count = 0
|
|
||||||
left_gap = 0
|
|
||||||
running_right = 0
|
|
||||||
for gi in range(total - 1):
|
|
||||||
running_right = max(
|
|
||||||
running_right,
|
|
||||||
all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
|
|
||||||
)
|
|
||||||
gap = all_wbs_with_cell[gi + 1][0] - running_right
|
|
||||||
if gap > 30:
|
|
||||||
left_strip_count = gi + 1
|
|
||||||
left_gap = gap
|
|
||||||
break
|
|
||||||
|
|
||||||
# -- Right-edge scan --
|
|
||||||
right_strip_count = 0
|
|
||||||
right_gap = 0
|
|
||||||
running_left = all_wbs_with_cell[-1][0]
|
|
||||||
for gi in range(total - 1, 0, -1):
|
|
||||||
running_left = min(running_left, all_wbs_with_cell[gi][0])
|
|
||||||
prev_right = (
|
|
||||||
all_wbs_with_cell[gi - 1][0]
|
|
||||||
+ all_wbs_with_cell[gi - 1][1].get("width", 0)
|
|
||||||
)
|
|
||||||
gap = running_left - prev_right
|
|
||||||
if gap > 30:
|
|
||||||
right_strip_count = total - gi
|
|
||||||
right_gap = gap
|
|
||||||
break
|
|
||||||
|
|
||||||
strip_wbs: set = set()
|
|
||||||
strip_side = ""
|
|
||||||
strip_gap = 0
|
|
||||||
strip_count = 0
|
|
||||||
if left_strip_count > 0 and left_strip_count / total < 0.20:
|
|
||||||
strip_side = "left"
|
|
||||||
strip_count = left_strip_count
|
|
||||||
strip_gap = left_gap
|
|
||||||
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
|
|
||||||
elif right_strip_count > 0 and right_strip_count / total < 0.20:
|
|
||||||
strip_side = "right"
|
|
||||||
strip_count = right_strip_count
|
|
||||||
strip_gap = right_gap
|
|
||||||
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
|
|
||||||
|
|
||||||
if not strip_wbs:
|
|
||||||
continue
|
|
||||||
for cell in cells:
|
|
||||||
wbs = cell.get("word_boxes") or []
|
|
||||||
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
|
|
||||||
if len(filtered) < len(wbs):
|
|
||||||
border_strip_removed += len(wbs) - len(filtered)
|
|
||||||
cell["word_boxes"] = filtered
|
|
||||||
cell["text"] = _words_to_reading_order_text(filtered)
|
|
||||||
z["cells"] = [c for c in cells
|
|
||||||
if (c.get("word_boxes") or c.get("text", "").strip())]
|
|
||||||
logger.info(
|
|
||||||
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
|
|
||||||
"(gap=%dpx, strip=%d/%d wbs)",
|
|
||||||
border_strip_removed, strip_side, z.get("zone_index", 0),
|
|
||||||
strip_gap, strip_count, total,
|
|
||||||
)
|
|
||||||
|
|
||||||
return border_prefiltered
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
|
|
||||||
"""Remove decorative edge columns (alphabet sidebar safety net).
|
|
||||||
|
|
||||||
Dictionary pages have A-Z letter sidebars that OCR reads as single-
|
|
||||||
character word_boxes.
|
|
||||||
"""
|
|
||||||
for z in zones_data:
|
|
||||||
columns = z.get("columns", [])
|
|
||||||
cells = z.get("cells", [])
|
|
||||||
if len(columns) < 3 or not cells:
|
|
||||||
continue
|
|
||||||
col_cells: Dict[str, List[Dict]] = {}
|
|
||||||
for cell in cells:
|
|
||||||
ct = cell.get("col_type", "")
|
|
||||||
if ct.startswith("column_"):
|
|
||||||
col_cells.setdefault(ct, []).append(cell)
|
|
||||||
col_types_ordered = sorted(col_cells.keys())
|
|
||||||
if len(col_types_ordered) < 3:
|
|
||||||
continue
|
|
||||||
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
|
|
||||||
edge_cells_list = col_cells.get(edge_ct, [])
|
|
||||||
if len(edge_cells_list) < 3:
|
|
||||||
continue
|
|
||||||
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
|
|
||||||
avg_len = sum(len(t) for t in texts) / len(texts)
|
|
||||||
single_char = sum(1 for t in texts if len(t) <= 1)
|
|
||||||
single_ratio = single_char / len(texts)
|
|
||||||
if avg_len > 1.5:
|
|
||||||
continue
|
|
||||||
if single_ratio < 0.7:
|
|
||||||
continue
|
|
||||||
removed_count = len(edge_cells_list)
|
|
||||||
edge_ids = {id(c) for c in edge_cells_list}
|
|
||||||
z["cells"] = [c for c in cells if id(c) not in edge_ids]
|
|
||||||
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
|
|
||||||
logger.info(
|
|
||||||
"Step 4f: removed decorative edge column '%s' from zone %d "
|
|
||||||
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
|
|
||||||
edge_ct, z.get("zone_index", 0), removed_count,
|
|
||||||
avg_len, single_ratio * 100,
|
|
||||||
)
|
|
||||||
break # only remove one edge per zone
|
|
||||||
|
|||||||
@@ -1,213 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/build/core.py
|
||||||
Grid Build Core — the main _build_grid_core() function.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Extracted from grid_editor_api.py for maintainability.
|
_sys.modules[__name__] = _importlib.import_module("grid.build.core")
|
||||||
Takes merged OCR word positions and builds a structured, zone-aware grid.
|
|
||||||
|
|
||||||
The function delegates to phase-specific modules:
|
|
||||||
- grid_build_zones.py — image loading, graphic/box detection, zone grids
|
|
||||||
- grid_build_cleanup.py — junk rows, artifacts, pipes, border strips
|
|
||||||
- grid_build_text_ops.py — color, headings, IPA, page refs
|
|
||||||
- grid_build_finalize.py — bullets, max_columns, dictionary, spelling, result
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from grid_editor_helpers import (
|
|
||||||
_flatten_word_boxes,
|
|
||||||
_get_content_bounds,
|
|
||||||
_filter_decorative_margin,
|
|
||||||
_filter_footer_words,
|
|
||||||
_filter_header_junk,
|
|
||||||
)
|
|
||||||
|
|
||||||
from grid_build_zones import _build_zones
|
|
||||||
from grid_build_cleanup import _cleanup_zones
|
|
||||||
from grid_build_text_ops import _process_text
|
|
||||||
from grid_build_finalize import _finalize_grid
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
async def _build_grid_core(
|
|
||||||
session_id: str,
|
|
||||||
session: dict,
|
|
||||||
*,
|
|
||||||
ipa_mode: str = "auto",
|
|
||||||
syllable_mode: str = "auto",
|
|
||||||
enhance: bool = True,
|
|
||||||
max_columns: Optional[int] = None,
|
|
||||||
min_conf: Optional[int] = None,
|
|
||||||
) -> dict:
|
|
||||||
"""Core grid building logic — pure computation, no HTTP or DB side effects.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
session_id: Session identifier (for logging and image loading).
|
|
||||||
session: Full session dict from get_session_db().
|
|
||||||
ipa_mode: "auto" (only when English headwords detected), "all"
|
|
||||||
(force IPA on all content columns), "en" (English column only),
|
|
||||||
"de" (German/definition columns only), or "none" (skip entirely).
|
|
||||||
syllable_mode: "auto" (only when original has pipe dividers),
|
|
||||||
"all" (force syllabification on all words), "en" (English only),
|
|
||||||
"de" (German only), or "none" (skip).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
StructuredGrid result dict.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If session data is incomplete.
|
|
||||||
"""
|
|
||||||
t0 = time.time()
|
|
||||||
|
|
||||||
# ── Phase 1: Input Validation & Word Filtering ──────────────────
|
|
||||||
|
|
||||||
# 1. Validate and load word results
|
|
||||||
word_result = session.get("word_result")
|
|
||||||
if not word_result or not word_result.get("cells"):
|
|
||||||
raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
|
|
||||||
|
|
||||||
img_w = word_result.get("image_width", 0)
|
|
||||||
img_h = word_result.get("image_height", 0)
|
|
||||||
if not img_w or not img_h:
|
|
||||||
raise ValueError("Missing image dimensions in word_result")
|
|
||||||
|
|
||||||
# 2. Flatten all word boxes from cells
|
|
||||||
all_words = _flatten_word_boxes(word_result["cells"])
|
|
||||||
if not all_words:
|
|
||||||
raise ValueError("No word boxes found in cells")
|
|
||||||
|
|
||||||
# 2a-pre. Apply min_conf filter if specified
|
|
||||||
if min_conf and min_conf > 0:
|
|
||||||
before = len(all_words)
|
|
||||||
all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
|
|
||||||
removed = before - len(all_words)
|
|
||||||
if removed:
|
|
||||||
logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
|
|
||||||
session_id, min_conf, removed, before)
|
|
||||||
|
|
||||||
logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
|
|
||||||
session_id, len(all_words), len(word_result["cells"]),
|
|
||||||
enhance, max_columns, min_conf)
|
|
||||||
|
|
||||||
# 2b. Filter decorative margin columns (alphabet graphics)
|
|
||||||
margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
|
|
||||||
margin_strip_detected = margin_strip_info.get("found", False)
|
|
||||||
|
|
||||||
# Read document_category from session
|
|
||||||
document_category = session.get("document_category")
|
|
||||||
|
|
||||||
# 2c. Filter footer rows (page numbers at the very bottom)
|
|
||||||
page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
|
|
||||||
|
|
||||||
# 2c2. Filter OCR junk from header illustrations
|
|
||||||
_filter_header_junk(all_words, img_h, logger, session_id)
|
|
||||||
|
|
||||||
# 2d. Filter words inside user-defined exclude regions
|
|
||||||
structure_result = session.get("structure_result")
|
|
||||||
exclude_rects = []
|
|
||||||
if structure_result:
|
|
||||||
for er in structure_result.get("exclude_regions", []):
|
|
||||||
exclude_rects.append({
|
|
||||||
"x": er["x"], "y": er["y"],
|
|
||||||
"w": er["w"], "h": er["h"],
|
|
||||||
})
|
|
||||||
if exclude_rects:
|
|
||||||
before = len(all_words)
|
|
||||||
filtered = []
|
|
||||||
for w in all_words:
|
|
||||||
w_cx = w["left"] + w.get("width", 0) / 2
|
|
||||||
w_cy = w["top"] + w.get("height", 0) / 2
|
|
||||||
inside = any(
|
|
||||||
er["x"] <= w_cx <= er["x"] + er["w"]
|
|
||||||
and er["y"] <= w_cy <= er["y"] + er["h"]
|
|
||||||
for er in exclude_rects
|
|
||||||
)
|
|
||||||
if not inside:
|
|
||||||
filtered.append(w)
|
|
||||||
removed = before - len(filtered)
|
|
||||||
if removed:
|
|
||||||
all_words = filtered
|
|
||||||
logger.info(
|
|
||||||
"build-grid session %s: removed %d words inside %d user exclude region(s)",
|
|
||||||
session_id, removed, len(exclude_rects),
|
|
||||||
)
|
|
||||||
|
|
||||||
# 2e. Hard-filter words inside graphic/image regions from structure step
|
|
||||||
graphic_rects: List[Dict[str, int]] = []
|
|
||||||
if structure_result:
|
|
||||||
for g in structure_result.get("graphics", []):
|
|
||||||
graphic_rects.append({
|
|
||||||
"x": g["x"], "y": g["y"],
|
|
||||||
"w": g["w"], "h": g["h"],
|
|
||||||
})
|
|
||||||
if graphic_rects:
|
|
||||||
before = len(all_words)
|
|
||||||
all_words = [
|
|
||||||
w for w in all_words
|
|
||||||
if not any(
|
|
||||||
gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
|
|
||||||
and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
|
|
||||||
for gr in graphic_rects
|
|
||||||
)
|
|
||||||
]
|
|
||||||
removed = before - len(all_words)
|
|
||||||
if removed:
|
|
||||||
logger.info(
|
|
||||||
"build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
|
|
||||||
session_id, removed, len(graphic_rects),
|
|
||||||
)
|
|
||||||
|
|
||||||
content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
|
|
||||||
|
|
||||||
# ── Phase 2: Image Processing & Zone Detection ──────────────────
|
|
||||||
|
|
||||||
zone_result = await _build_zones(
|
|
||||||
session_id, session, all_words, graphic_rects,
|
|
||||||
content_x, content_y, content_w, content_h,
|
|
||||||
img_w, img_h,
|
|
||||||
)
|
|
||||||
zones_data = zone_result["zones_data"]
|
|
||||||
boxes_detected = zone_result["boxes_detected"]
|
|
||||||
recovered_count = zone_result["recovered_count"]
|
|
||||||
border_prefiltered = zone_result["border_prefiltered"]
|
|
||||||
img_bgr = zone_result["img_bgr"]
|
|
||||||
|
|
||||||
# ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────
|
|
||||||
|
|
||||||
border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id)
|
|
||||||
|
|
||||||
# ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────
|
|
||||||
|
|
||||||
text_result = _process_text(
|
|
||||||
zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
|
|
||||||
)
|
|
||||||
|
|
||||||
# ── Phase 5b+6: Finalize & Result Assembly ──────────────────────
|
|
||||||
|
|
||||||
duration = time.time() - t0
|
|
||||||
|
|
||||||
result = _finalize_grid(
|
|
||||||
zones_data=zones_data,
|
|
||||||
all_words=all_words,
|
|
||||||
img_bgr=img_bgr,
|
|
||||||
img_w=img_w,
|
|
||||||
img_h=img_h,
|
|
||||||
session_id=session_id,
|
|
||||||
max_columns=max_columns,
|
|
||||||
ipa_mode=ipa_mode,
|
|
||||||
syllable_mode=syllable_mode,
|
|
||||||
en_col_type=text_result["en_col_type"],
|
|
||||||
ipa_target_cols=text_result["ipa_target_cols"],
|
|
||||||
all_content_cols=text_result["all_content_cols"],
|
|
||||||
skip_ipa=text_result["skip_ipa"],
|
|
||||||
document_category=document_category,
|
|
||||||
margin_strip_detected=margin_strip_detected,
|
|
||||||
page_number_info=text_result["page_number_info"],
|
|
||||||
boxes_detected=boxes_detected,
|
|
||||||
recovered_count=recovered_count,
|
|
||||||
duration=duration,
|
|
||||||
)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|||||||
@@ -1,452 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/build/finalize.py
|
||||||
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
|
import importlib as _importlib
|
||||||
dictionary detection, syllable dividers, spell checking, empty column
|
import sys as _sys
|
||||||
removal, and result assembly.
|
_sys.modules[__name__] = _importlib.import_module("grid.build.finalize")
|
||||||
|
|
||||||
Extracted from grid_build_core.py for maintainability.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from grid_build_cell_ops import (
|
|
||||||
_remove_bullets_and_artifacts,
|
|
||||||
_remove_garbled_cells,
|
|
||||||
_normalize_word_order,
|
|
||||||
_enforce_max_columns,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Run the last processing steps on the zones and build the result dict.

    Steps, in order: bullet/artifact removal, garbled-cell removal, word
    order normalisation, optional max_columns enforcement, dictionary
    detection, word-gap merge, pipe auto-correction, syllable divider
    insertion, merged-word splitting, IPA bracket spacing, spell checking,
    empty-column removal, removal of the internal ``_ipa_corrected`` cell
    flag, and result assembly via ``_assemble_result``.

    The word-gap merge and pipe auto-correction are best effort: any
    exception they raise is logged as a warning and processing continues.
    """
    # Column total is taken before any step below can change the columns.
    total_cols = sum(len(zone.get("columns", [])) for zone in zones_data)

    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)

    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)

    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)

    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data=zones_data,
        img_w=img_w,
        img_h=img_h,
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
    )

    # --- Word-gap merge ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as exc:
        logger.warning("Word-gap merge failed: %s", exc)

    # --- Pipe auto-correction ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as exc:
        logger.warning("Pipe autocorrect failed: %s", exc)

    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data=zones_data,
        img_bgr=img_bgr,
        session_id=session_id,
        syllable_mode=syllable_mode,
        dict_detection=dict_detection,
        en_col_type=en_col_type,
        all_content_cols=all_content_cols,
        total_cols=total_cols,
    )

    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)

    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)

    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)

    # --- Debug log cell counts per column ---
    for zone in zones_data:
        if zone.get("zone_type") != "content":
            continue
        from collections import Counter as _Counter
        per_col = _Counter(cell.get("col_index") for cell in zone.get("cells", []))
        logger.info(
            "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
            zone.get("zone_index", 0),
            len(zone.get("columns", [])),
            dict(sorted(per_col.items())),
        )

    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)

    # Internal bookkeeping flags must not leak into the result.
    for zone in zones_data:
        for cell in zone.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data=zones_data,
        all_words=all_words,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        ipa_target_cols=ipa_target_cols,
        skip_ipa=skip_ipa,
        dict_detection=dict_detection,
        page_number_info=page_number_info,
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=duration,
        syllable_insertions=syllable_insertions,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
) -> Dict[str, Any]:
    """Score every eligible zone for dictionary layout and keep the best hit.

    A zone is scored when it has at least 2 columns and at least 10 cells.
    For each of its columns a ``ColumnGeometry`` is built from the word_boxes
    of the column's cells (cells without word_boxes but with text contribute
    one pseudo word derived from the cell itself) and the list is passed to
    ``_score_dictionary_signals``.

    Returns:
        The scoring result with the highest ``confidence`` across zones, or
        ``{"is_dictionary": False, "confidence": 0.0}`` when nothing scored
        higher. Exceptions raised during scoring are logged as a warning and
        the best result found so far is returned.
    """
    from cv_layout import _score_dictionary_signals

    best: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        from cv_vocab_types import ColumnGeometry

        for zone in zones_data:
            zone_cells = zone.get("cells", [])
            zone_cols = zone.get("columns", [])
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue

            geometries = []
            for col in zone_cols:
                col_idx = col["index"]
                words = []
                for cell in zone_cells:
                    if cell.get("col_index") != col_idx:
                        continue
                    boxes = cell.get("word_boxes")
                    if boxes:
                        words.extend(
                            {
                                "text": wb.get("text", ""),
                                "conf": wb.get("conf", 0),
                                "top": wb.get("top", 0),
                                "left": wb.get("left", 0),
                                "height": wb.get("height", 0),
                                "width": wb.get("width", 0),
                            }
                            for wb in boxes
                        )
                    elif cell.get("text"):
                        # No word_boxes: fall back to the cell's own text/bbox.
                        bbox = cell.get("bbox_px", {})
                        words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": bbox.get("y", 0),
                            "left": bbox.get("x", 0),
                            "height": bbox.get("h", 20),
                            "width": bbox.get("w", 50),
                        })

                x_min = col.get("x_min_px", 0)
                col_w = col.get("x_max_px", 0) - x_min
                geometries.append(ColumnGeometry(
                    index=col_idx, x=x_min, y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(words), words=words,
                    width_ratio=col_w / max(img_w, 1),
                ))

            if len(geometries) < 2:
                continue
            candidate = _score_dictionary_signals(
                geometries,
                document_category=document_category,
                margin_strip_detected=margin_strip_detected,
            )
            if candidate["confidence"] > best["confidence"]:
                best = candidate
    except Exception as exc:
        logger.warning("Dictionary detection failed: %s", exc)
    return best
|
|
||||||
|
|
||||||
|
|
||||||
def _insert_syllable_dividers(
|
|
||||||
zones_data: List[Dict[str, Any]],
|
|
||||||
img_bgr: Any,
|
|
||||||
session_id: str,
|
|
||||||
syllable_mode: str,
|
|
||||||
dict_detection: Dict[str, Any],
|
|
||||||
en_col_type: Optional[str],
|
|
||||||
all_content_cols: set,
|
|
||||||
total_cols: int,
|
|
||||||
) -> int:
|
|
||||||
"""Insert syllable dividers for dictionary pages. Returns insertion count."""
|
|
||||||
syllable_insertions = 0
|
|
||||||
if syllable_mode == "none" or img_bgr is None:
|
|
||||||
if syllable_mode == "none":
|
|
||||||
for z in zones_data:
|
|
||||||
for cell in z.get("cells", []):
|
|
||||||
t = cell.get("text", "")
|
|
||||||
if "|" in t:
|
|
||||||
cell["text"] = t.replace("|", "")
|
|
||||||
return syllable_insertions
|
|
||||||
|
|
||||||
_syllable_eligible = False
|
|
||||||
if syllable_mode in ("all", "de", "en"):
|
|
||||||
_syllable_eligible = True
|
|
||||||
elif (dict_detection.get("is_dictionary")
|
|
||||||
and dict_detection.get("article_col_index") is not None):
|
|
||||||
_syllable_eligible = True
|
|
||||||
|
|
||||||
_syllable_col_filter: Optional[set] = None
|
|
||||||
if syllable_mode == "en":
|
|
||||||
_syllable_col_filter = {en_col_type} if en_col_type else set()
|
|
||||||
elif syllable_mode == "de":
|
|
||||||
if en_col_type and total_cols >= 3:
|
|
||||||
_syllable_col_filter = all_content_cols - {en_col_type}
|
|
||||||
|
|
||||||
if _syllable_eligible:
|
|
||||||
try:
|
|
||||||
from cv_syllable_detect import insert_syllable_dividers
|
|
||||||
force_syllables = (syllable_mode in ("all", "de", "en"))
|
|
||||||
syllable_insertions = insert_syllable_dividers(
|
|
||||||
zones_data, img_bgr, session_id,
|
|
||||||
force=force_syllables,
|
|
||||||
col_filter=_syllable_col_filter,
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Syllable insertion failed: %s", e)
|
|
||||||
|
|
||||||
return syllable_insertions
|
|
||||||
|
|
||||||
|
|
||||||
def _split_merged_words(
|
|
||||||
zones_data: List[Dict[str, Any]],
|
|
||||||
session_id: str,
|
|
||||||
) -> None:
|
|
||||||
"""Split merged words using dictionary lookup."""
|
|
||||||
try:
|
|
||||||
from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
|
|
||||||
if not _SPELL_AVAILABLE:
|
|
||||||
return
|
|
||||||
split_count = 0
|
|
||||||
for z in zones_data:
|
|
||||||
for cell in z.get("cells", []):
|
|
||||||
text = cell.get("text", "")
|
|
||||||
if not text:
|
|
||||||
continue
|
|
||||||
parts = []
|
|
||||||
changed = False
|
|
||||||
for token in text.split():
|
|
||||||
clean = token
|
|
||||||
bracket_pos = clean.find('[')
|
|
||||||
suffix_ipa = ""
|
|
||||||
if bracket_pos > 0:
|
|
||||||
suffix_ipa = clean[bracket_pos:]
|
|
||||||
clean = clean[:bracket_pos]
|
|
||||||
suffix_punct = ""
|
|
||||||
stripped = clean.rstrip(".,!?;:'\")")
|
|
||||||
if stripped != clean:
|
|
||||||
suffix_punct = clean[len(stripped):]
|
|
||||||
clean = stripped
|
|
||||||
suffix = suffix_punct + suffix_ipa
|
|
||||||
contraction = ""
|
|
||||||
if "'" in clean and clean.index("'") >= 2:
|
|
||||||
apos_pos = clean.index("'")
|
|
||||||
contraction = clean[apos_pos:]
|
|
||||||
clean = clean[:apos_pos]
|
|
||||||
suffix = contraction + suffix
|
|
||||||
if len(clean) >= 4 and clean.isalpha():
|
|
||||||
split = _try_split_merged_word(clean)
|
|
||||||
if split:
|
|
||||||
parts.append(split + suffix)
|
|
||||||
changed = True
|
|
||||||
continue
|
|
||||||
parts.append(token)
|
|
||||||
if changed:
|
|
||||||
cell["text"] = " ".join(parts)
|
|
||||||
split_count += 1
|
|
||||||
if split_count:
|
|
||||||
logger.info("build-grid session %s: split %d merged words", session_id, split_count)
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
|
|
||||||
"""Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
|
|
||||||
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
|
|
||||||
for z in zones_data:
|
|
||||||
for cell in z.get("cells", []):
|
|
||||||
text = cell.get("text", "")
|
|
||||||
if text and "[" in text:
|
|
||||||
fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
|
|
||||||
if fixed != text:
|
|
||||||
cell["text"] = fixed
|
|
||||||
|
|
||||||
|
|
||||||
def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
) -> None:
    """Apply ``SmartSpellChecker`` corrections to all ``column_*`` cells.

    Cells with blank text or with a col_type that does not start with
    ``column_`` are skipped. The language hint is per column when there are
    at least 3 columns and an English column is known ("en" for that column,
    "de" for the others); in every other case it is "auto". Corrected text
    replaces the cell text in place.

    A missing ``smart_spell`` module is logged at debug level; any other
    error is logged as a warning. Neither is propagated to the caller.
    """
    try:
        from smart_spell import SmartSpellChecker
        checker = SmartSpellChecker()
        corrected_cells = 0
        per_column_lang = bool(total_cols >= 3 and en_col_type)

        for zone in zones_data:
            for cell in zone.get("cells", []):
                text = cell.get("text", "")
                if not (text and text.strip()):
                    continue
                col_type = cell.get("col_type", "")
                if not col_type.startswith("column_"):
                    continue

                if per_column_lang:
                    lang = "en" if col_type == en_col_type else "de"
                else:
                    lang = "auto"

                outcome = checker.correct_text(text, lang=lang)
                if outcome.changed:
                    cell["text"] = outcome.corrected
                    corrected_cells += 1

        if corrected_cells:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, corrected_cells,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as exc:
        logger.warning("SmartSpellChecker error in build-grid: %s", exc)
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
|
|
||||||
"""Remove columns that have no cells assigned."""
|
|
||||||
for z in zones_data:
|
|
||||||
cells = z.get("cells", [])
|
|
||||||
used_col_indices = {c.get("col_index") for c in cells}
|
|
||||||
old_cols = z.get("columns", [])
|
|
||||||
new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
|
|
||||||
if len(new_cols) < len(old_cols):
|
|
||||||
old_to_new = {}
|
|
||||||
for new_i, col in enumerate(new_cols):
|
|
||||||
old_i = col.get("col_index", col.get("index", new_i))
|
|
||||||
old_to_new[old_i] = new_i
|
|
||||||
col["col_index"] = new_i
|
|
||||||
col["index"] = new_i
|
|
||||||
col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
|
|
||||||
for cell in cells:
|
|
||||||
old_ci = cell.get("col_index", 0)
|
|
||||||
cell["col_index"] = old_to_new.get(old_ci, old_ci)
|
|
||||||
cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
|
|
||||||
z["columns"] = new_cols
|
|
||||||
|
|
||||||
|
|
||||||
def _assemble_result(
|
|
||||||
zones_data: List[Dict[str, Any]],
|
|
||||||
all_words: List[Dict[str, Any]],
|
|
||||||
img_w: int,
|
|
||||||
img_h: int,
|
|
||||||
session_id: str,
|
|
||||||
ipa_mode: str,
|
|
||||||
syllable_mode: str,
|
|
||||||
ipa_target_cols: set,
|
|
||||||
skip_ipa: bool,
|
|
||||||
dict_detection: Dict[str, Any],
|
|
||||||
page_number_info: Optional[Dict],
|
|
||||||
boxes_detected: int,
|
|
||||||
recovered_count: int,
|
|
||||||
duration: float,
|
|
||||||
syllable_insertions: int,
|
|
||||||
) -> dict:
|
|
||||||
"""Build the final result dict (Phase 6)."""
|
|
||||||
total_cells = sum(len(z.get("cells", [])) for z in zones_data)
|
|
||||||
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
|
|
||||||
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
|
|
||||||
|
|
||||||
# Collect color statistics
|
|
||||||
color_stats: Dict[str, int] = {}
|
|
||||||
for z in zones_data:
|
|
||||||
for cell in z.get("cells", []):
|
|
||||||
for wb in cell.get("word_boxes", []):
|
|
||||||
cn = wb.get("color_name", "black")
|
|
||||||
color_stats[cn] = color_stats.get(cn, 0) + 1
|
|
||||||
|
|
||||||
# Compute layout metrics
|
|
||||||
all_content_row_heights: List[float] = []
|
|
||||||
for z in zones_data:
|
|
||||||
for row in z.get("rows", []):
|
|
||||||
if not row.get("is_header", False):
|
|
||||||
h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
|
|
||||||
if h > 0:
|
|
||||||
all_content_row_heights.append(h)
|
|
||||||
avg_row_height = (
|
|
||||||
sum(all_content_row_heights) / len(all_content_row_heights)
|
|
||||||
if all_content_row_heights else 30.0
|
|
||||||
)
|
|
||||||
font_size_suggestion = max(10, int(avg_row_height * 0.6))
|
|
||||||
|
|
||||||
return {
|
|
||||||
"session_id": session_id,
|
|
||||||
"image_width": img_w,
|
|
||||||
"image_height": img_h,
|
|
||||||
"zones": zones_data,
|
|
||||||
"boxes_detected": boxes_detected,
|
|
||||||
"summary": {
|
|
||||||
"total_zones": len(zones_data),
|
|
||||||
"total_columns": total_columns,
|
|
||||||
"total_rows": total_rows,
|
|
||||||
"total_cells": total_cells,
|
|
||||||
"total_words": len(all_words),
|
|
||||||
"recovered_colored": recovered_count,
|
|
||||||
"color_stats": color_stats,
|
|
||||||
},
|
|
||||||
"formatting": {
|
|
||||||
"bold_columns": [],
|
|
||||||
"header_rows": [],
|
|
||||||
},
|
|
||||||
"layout_metrics": {
|
|
||||||
"page_width_px": img_w,
|
|
||||||
"page_height_px": img_h,
|
|
||||||
"avg_row_height_px": round(avg_row_height, 1),
|
|
||||||
"font_size_suggestion_px": font_size_suggestion,
|
|
||||||
},
|
|
||||||
"dictionary_detection": {
|
|
||||||
"is_dictionary": dict_detection.get("is_dictionary", False),
|
|
||||||
"confidence": dict_detection.get("confidence", 0.0),
|
|
||||||
"signals": dict_detection.get("signals", {}),
|
|
||||||
"article_col_index": dict_detection.get("article_col_index"),
|
|
||||||
"headword_col_index": dict_detection.get("headword_col_index"),
|
|
||||||
},
|
|
||||||
"processing_modes": {
|
|
||||||
"ipa_mode": ipa_mode,
|
|
||||||
"syllable_mode": syllable_mode,
|
|
||||||
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
|
|
||||||
"syllables_applied": syllable_insertions > 0,
|
|
||||||
},
|
|
||||||
"page_number": page_number_info,
|
|
||||||
"duration_seconds": round(duration, 2),
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,489 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/build/text_ops.py
|
||||||
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
|
import importlib as _importlib
|
||||||
parenthesis fix, IPA phonetic correction, page ref extraction, and
|
import sys as _sys
|
||||||
slash-IPA conversion.
|
_sys.modules[__name__] = _importlib.import_module("grid.build.text_ops")
|
||||||
|
|
||||||
Extracted from grid_build_core.py for maintainability.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
||||||
|
|
||||||
from cv_color_detect import detect_word_colors
|
|
||||||
from cv_ocr_engines import (
|
|
||||||
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
|
|
||||||
_lookup_ipa,
|
|
||||||
)
|
|
||||||
from grid_editor_helpers import (
|
|
||||||
_detect_heading_rows_by_color,
|
|
||||||
_detect_heading_rows_by_single_cell,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
) -> Dict[str, Any]:
    """Run color annotation, heading detection, IPA correction, and page refs.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: BGR image array (or None); color annotation is skipped
            when it is None.
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode. ``"none"`` strips square-bracket
            groups from all content columns and disables every IPA step.
        page_number_info: Existing page number metadata (may be None).

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
        for z in zones_data:
            for cell in z.get("cells", []):
                all_wb.extend(cell.get("word_boxes", []))
        detect_word_colors(img_bgr, all_wb)

    # 5a. Heading detection by color + height
    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if heading_count:
        logger.info("Detected %d heading rows by color+height", heading_count)

    # 5b. Fix unmatched parentheses in cell text
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if ")" in text and "(" not in text:
                cell["text"] = "(" + text

    # 5c. IPA phonetic correction
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()
    skip_ipa = (ipa_mode == "none")

    # When ipa_mode=none, strip ALL square brackets from ALL content columns
    if skip_ipa:
        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True

    if not skip_ipa and total_cols >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            all_cells, total_cols, ipa_mode, zones_data
        )
    elif not skip_ipa:
        # Collect all_content_cols even when <3 cols (needed by finalize)
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                all_content_cols.add(ct)

    # 5e. Heading detection by single-cell rows
    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_heading_count:
        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)

    # 5f. Strip IPA from headings
    for z in zones_data:
        for cell in z.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            text = cell.get("text", "")
            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
            # Never blank a heading that consisted only of a bracket group.
            if stripped and stripped != text:
                cell["text"] = stripped

    # 5g. Extract page_ref cells and footer rows.
    # When no metadata was passed in, the helper builds a fresh dict from a
    # page-number footer by rebinding its own parameter, which is invisible
    # in this scope. Adopt its return value whenever it hands one back so the
    # detected page number is not lost.
    footer_page_info = _extract_page_refs_and_footers(zones_data, page_number_info)
    if footer_page_info is not None:
        page_number_info = footer_page_info

    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)

    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }
|
|
||||||
|
|
||||||
|
|
||||||
def _run_ipa_correction(
|
|
||||||
all_cells: List[Dict],
|
|
||||||
total_cols: int,
|
|
||||||
ipa_mode: str,
|
|
||||||
zones_data: List[Dict[str, Any]],
|
|
||||||
) -> Tuple[Optional[str], set, set]:
|
|
||||||
"""Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
|
|
||||||
en_col_type = None
|
|
||||||
all_content_cols: set = set()
|
|
||||||
|
|
||||||
# Detect English headword column via IPA signals
|
|
||||||
col_ipa_count: Dict[str, int] = {}
|
|
||||||
for cell in all_cells:
|
|
||||||
ct = cell.get("col_type", "")
|
|
||||||
if not ct.startswith("column_"):
|
|
||||||
continue
|
|
||||||
txt = cell.get("text", "") or ""
|
|
||||||
if txt.strip():
|
|
||||||
all_content_cols.add(ct)
|
|
||||||
if '[' in txt or _text_has_garbled_ipa(txt):
|
|
||||||
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
|
|
||||||
if col_ipa_count:
|
|
||||||
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
|
|
||||||
elif ipa_mode == "all":
|
|
||||||
col_cell_count: Dict[str, int] = {}
|
|
||||||
for cell in all_cells:
|
|
||||||
ct = cell.get("col_type", "")
|
|
||||||
if ct.startswith("column_") and (cell.get("text") or "").strip():
|
|
||||||
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
|
|
||||||
if col_cell_count:
|
|
||||||
en_col_type = max(col_cell_count, key=col_cell_count.get)
|
|
||||||
|
|
||||||
# Decide which columns to process based on ipa_mode
|
|
||||||
en_ipa_target_cols: set = set()
|
|
||||||
de_ipa_target_cols: set = set()
|
|
||||||
if ipa_mode in ("auto", "en"):
|
|
||||||
if en_col_type:
|
|
||||||
en_ipa_target_cols.add(en_col_type)
|
|
||||||
elif ipa_mode == "de":
|
|
||||||
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
|
|
||||||
elif ipa_mode == "all":
|
|
||||||
if en_col_type:
|
|
||||||
en_ipa_target_cols.add(en_col_type)
|
|
||||||
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
|
|
||||||
|
|
||||||
# --- Strip IPA from columns NOT in the target set ---
|
|
||||||
_SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
|
|
||||||
strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
|
|
||||||
if strip_en_ipa or ipa_mode == "none":
|
|
||||||
strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
|
|
||||||
for cell in all_cells:
|
|
||||||
ct = cell.get("col_type", "")
|
|
||||||
if ct not in strip_cols:
|
|
||||||
continue
|
|
||||||
text = cell.get("text", "")
|
|
||||||
if "[" in text:
|
|
||||||
stripped = _SQUARE_BRACKET_RE.sub("", text)
|
|
||||||
if stripped != text:
|
|
||||||
cell["text"] = stripped.strip()
|
|
||||||
cell["_ipa_corrected"] = True
|
|
||||||
|
|
||||||
# --- English IPA (Britfone + eng_to_ipa) ---
|
|
||||||
if en_ipa_target_cols:
|
|
||||||
for cell in all_cells:
|
|
||||||
ct = cell.get("col_type")
|
|
||||||
if ct in en_ipa_target_cols:
|
|
||||||
cell["_orig_col_type"] = ct
|
|
||||||
cell["col_type"] = "column_en"
|
|
||||||
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
|
||||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
|
||||||
for cell in all_cells:
|
|
||||||
orig = cell.pop("_orig_col_type", None)
|
|
||||||
if orig:
|
|
||||||
cell["col_type"] = orig
|
|
||||||
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
|
||||||
cell["_ipa_corrected"] = True
|
|
||||||
|
|
||||||
# --- German IPA (wiki-pronunciation-dict + epitran) ---
|
|
||||||
if de_ipa_target_cols:
|
|
||||||
from cv_ipa_german import insert_german_ipa
|
|
||||||
insert_german_ipa(all_cells, de_ipa_target_cols)
|
|
||||||
|
|
||||||
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
|
|
||||||
|
|
||||||
# Mark cells whose text was changed by IPA correction
|
|
||||||
for cell in all_cells:
|
|
||||||
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
|
||||||
cell["_ipa_corrected"] = True
|
|
||||||
|
|
||||||
# 5d. Fix IPA continuation cells
|
|
||||||
skip_ipa = (ipa_mode == "none")
|
|
||||||
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
|
||||||
ipa_cont_fixed = 0
|
|
||||||
for z in ([] if skip_ipa else zones_data):
|
|
||||||
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
|
||||||
z_cells = z.get("cells", [])
|
|
||||||
for idx, row in enumerate(rows_sorted):
|
|
||||||
if idx == 0:
|
|
||||||
continue
|
|
||||||
ri = row["index"]
|
|
||||||
row_cells = [c for c in z_cells if c.get("row_index") == ri]
|
|
||||||
for cell in row_cells:
|
|
||||||
ct = cell.get("col_type", "")
|
|
||||||
if not ct.startswith("column_"):
|
|
||||||
continue
|
|
||||||
cell_text = (cell.get("text") or "").strip()
|
|
||||||
if not cell_text:
|
|
||||||
wb_texts = [w.get("text", "")
|
|
||||||
for w in cell.get("word_boxes", [])]
|
|
||||||
cell_text = " ".join(wb_texts).strip()
|
|
||||||
if not cell_text:
|
|
||||||
continue
|
|
||||||
|
|
||||||
is_bracketed = (
|
|
||||||
cell_text.startswith('[') and cell_text.endswith(']')
|
|
||||||
)
|
|
||||||
|
|
||||||
if is_bracketed:
|
|
||||||
if not _text_has_garbled_ipa(cell_text):
|
|
||||||
continue
|
|
||||||
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
content_cells_in_row = [
|
|
||||||
c for c in row_cells
|
|
||||||
if c.get("col_type", "").startswith("column_")
|
|
||||||
and c.get("col_type") != "column_1"
|
|
||||||
]
|
|
||||||
if len(content_cells_in_row) != 1:
|
|
||||||
continue
|
|
||||||
if not _text_has_garbled_ipa(cell_text):
|
|
||||||
continue
|
|
||||||
if any(c in _REAL_IPA_CHARS for c in cell_text):
|
|
||||||
continue
|
|
||||||
_words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
|
|
||||||
if len(_words_in_text) >= 3:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Find headword in previous row, same column
|
|
||||||
prev_ri = rows_sorted[idx - 1]["index"]
|
|
||||||
prev_same_col = [
|
|
||||||
c for c in z_cells
|
|
||||||
if c.get("row_index") == prev_ri
|
|
||||||
and c.get("col_type") == ct
|
|
||||||
]
|
|
||||||
if not prev_same_col:
|
|
||||||
continue
|
|
||||||
prev_text = prev_same_col[0].get("text", "")
|
|
||||||
fixed = fix_ipa_continuation_cell(
|
|
||||||
cell_text, prev_text, pronunciation="british",
|
|
||||||
)
|
|
||||||
if fixed != cell_text:
|
|
||||||
cell["text"] = fixed
|
|
||||||
ipa_cont_fixed += 1
|
|
||||||
logger.info(
|
|
||||||
"IPA continuation R%d %s: '%s' -> '%s'",
|
|
||||||
ri, ct, cell_text, fixed,
|
|
||||||
)
|
|
||||||
if ipa_cont_fixed:
|
|
||||||
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
|
|
||||||
|
|
||||||
return en_col_type, ipa_target_cols, all_content_cols
|
|
||||||
|
|
||||||
|
|
||||||
def _extract_page_refs_and_footers(
|
|
||||||
zones_data: List[Dict[str, Any]],
|
|
||||||
page_number_info: Optional[Dict],
|
|
||||||
) -> None:
|
|
||||||
"""Extract page_ref cells and footer rows from content zones.
|
|
||||||
|
|
||||||
Modifies zones_data in place. Updates page_number_info if a page number
|
|
||||||
footer is found.
|
|
||||||
"""
|
|
||||||
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
|
||||||
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
|
|
||||||
_NUMBER_WORDS = {
|
|
||||||
"one", "two", "three", "four", "five", "six", "seven",
|
|
||||||
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
|
|
||||||
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
|
|
||||||
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
|
|
||||||
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
|
|
||||||
"einhundert", "zweihundert", "dreihundert", "vierhundert",
|
|
||||||
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
|
|
||||||
}
|
|
||||||
|
|
||||||
for z in zones_data:
|
|
||||||
if z.get("zone_type") != "content":
|
|
||||||
continue
|
|
||||||
cells = z.get("cells", [])
|
|
||||||
rows = z.get("rows", [])
|
|
||||||
if not rows:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Extract column_1 cells that look like page references
|
|
||||||
page_refs = []
|
|
||||||
page_ref_cell_ids = set()
|
|
||||||
for cell in cells:
|
|
||||||
if cell.get("col_type") != "column_1":
|
|
||||||
continue
|
|
||||||
text = (cell.get("text") or "").strip()
|
|
||||||
if not text:
|
|
||||||
continue
|
|
||||||
if not _PAGE_REF_RE.match(text):
|
|
||||||
continue
|
|
||||||
page_refs.append({
|
|
||||||
"row_index": cell.get("row_index"),
|
|
||||||
"text": text,
|
|
||||||
"bbox_pct": cell.get("bbox_pct", {}),
|
|
||||||
})
|
|
||||||
page_ref_cell_ids.add(cell.get("cell_id"))
|
|
||||||
|
|
||||||
# Detect footer: last non-header row if it has only 1 cell
|
|
||||||
footer_rows = []
|
|
||||||
non_header_rows = [r for r in rows if not r.get("is_header")]
|
|
||||||
if non_header_rows:
|
|
||||||
last_row = non_header_rows[-1]
|
|
||||||
last_ri = last_row["index"]
|
|
||||||
last_cells = [c for c in z["cells"]
|
|
||||||
if c.get("row_index") == last_ri]
|
|
||||||
if len(last_cells) == 1:
|
|
||||||
text = (last_cells[0].get("text") or "").strip()
|
|
||||||
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
|
|
||||||
has_commas = ',' in text
|
|
||||||
text_words = set(text.lower().split())
|
|
||||||
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
|
|
||||||
is_page_number = len(text) <= 20 or is_written_number
|
|
||||||
if (text and not has_real_ipa and not has_commas
|
|
||||||
and is_page_number
|
|
||||||
and last_cells[0].get("col_type") != "heading"):
|
|
||||||
footer_rows.append({
|
|
||||||
"row_index": last_ri,
|
|
||||||
"text": text,
|
|
||||||
"bbox_pct": last_cells[0].get("bbox_pct", {}),
|
|
||||||
})
|
|
||||||
|
|
||||||
# Classify footer rows
|
|
||||||
page_number_footers = []
|
|
||||||
other_footers = []
|
|
||||||
for fr in footer_rows:
|
|
||||||
ft = fr["text"].strip()
|
|
||||||
digits = "".join(c for c in ft if c.isdigit())
|
|
||||||
if digits and re.match(r'^[\d\s.]+$', ft):
|
|
||||||
page_number_footers.append(fr)
|
|
||||||
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
|
|
||||||
page_number_footers.append(fr)
|
|
||||||
else:
|
|
||||||
other_footers.append(fr)
|
|
||||||
|
|
||||||
# Remove page-number footer rows from grid entirely
|
|
||||||
if page_number_footers:
|
|
||||||
pn_ris = {fr["row_index"] for fr in page_number_footers}
|
|
||||||
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
|
|
||||||
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
|
|
||||||
pn_text = page_number_footers[0]["text"].strip()
|
|
||||||
pn_digits = "".join(c for c in pn_text if c.isdigit())
|
|
||||||
if not page_number_info:
|
|
||||||
page_number_info = {
|
|
||||||
"text": pn_text,
|
|
||||||
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
|
|
||||||
}
|
|
||||||
if pn_digits:
|
|
||||||
page_number_info["number"] = int(pn_digits)
|
|
||||||
|
|
||||||
# Mark remaining footer rows
|
|
||||||
if other_footers:
|
|
||||||
footer_ris = {fr["row_index"] for fr in other_footers}
|
|
||||||
for r in z["rows"]:
|
|
||||||
if r["index"] in footer_ris:
|
|
||||||
r["is_footer"] = True
|
|
||||||
for c in z["cells"]:
|
|
||||||
if c.get("row_index") in footer_ris:
|
|
||||||
c["col_type"] = "footer"
|
|
||||||
|
|
||||||
if page_refs or footer_rows:
|
|
||||||
logger.info(
|
|
||||||
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
|
|
||||||
len(page_refs), len(footer_rows), len(page_number_footers),
|
|
||||||
z.get("zone_index", 0),
|
|
||||||
)
|
|
||||||
|
|
||||||
if page_refs:
|
|
||||||
z["page_refs"] = page_refs
|
|
||||||
if other_footers:
|
|
||||||
z["footer"] = other_footers
|
|
||||||
|
|
||||||
|
|
||||||
def _convert_slash_ipa(
|
|
||||||
zones_data: List[Dict[str, Any]],
|
|
||||||
skip_ipa: bool,
|
|
||||||
en_col_type: Optional[str],
|
|
||||||
) -> None:
|
|
||||||
"""Convert slash-delimited IPA to bracket notation.
|
|
||||||
|
|
||||||
Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
|
|
||||||
"""
|
|
||||||
_SLASH_IPA_RE = re.compile(
|
|
||||||
r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1)
|
|
||||||
r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars
|
|
||||||
)
|
|
||||||
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
|
|
||||||
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
|
|
||||||
slash_ipa_fixed = 0
|
|
||||||
|
|
||||||
for z in ([] if skip_ipa else zones_data):
|
|
||||||
for cell in z.get("cells", []):
|
|
||||||
if en_col_type and cell.get("col_type") != en_col_type:
|
|
||||||
continue
|
|
||||||
text = cell.get("text", "")
|
|
||||||
if "/" not in text:
|
|
||||||
continue
|
|
||||||
|
|
||||||
def _replace_slash_ipa(m: re.Match) -> str:
|
|
||||||
nonlocal slash_ipa_fixed
|
|
||||||
headword = m.group(1)
|
|
||||||
ocr_ipa = m.group(2)
|
|
||||||
inner_raw = ocr_ipa.strip("/").strip()
|
|
||||||
if _SLASH_IPA_REJECT_RE.search(inner_raw):
|
|
||||||
return m.group(0)
|
|
||||||
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
|
|
||||||
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
|
|
||||||
if ipa:
|
|
||||||
slash_ipa_fixed += 1
|
|
||||||
return f"{headword} [{ipa}]"
|
|
||||||
inner = inner_raw.lstrip("'").strip()
|
|
||||||
if inner:
|
|
||||||
slash_ipa_fixed += 1
|
|
||||||
return f"{headword} [{inner}]"
|
|
||||||
return m.group(0)
|
|
||||||
|
|
||||||
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
|
|
||||||
|
|
||||||
_AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
|
|
||||||
|
|
||||||
def _replace_trailing_slash(m: re.Match) -> str:
|
|
||||||
nonlocal slash_ipa_fixed
|
|
||||||
inner = m.group(1).strip("/").strip().lstrip("'").strip()
|
|
||||||
if _SLASH_IPA_REJECT_RE.search(inner):
|
|
||||||
return m.group(0)
|
|
||||||
if inner:
|
|
||||||
slash_ipa_fixed += 1
|
|
||||||
return f" [{inner}]"
|
|
||||||
return m.group(0)
|
|
||||||
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
|
|
||||||
|
|
||||||
if new_text == text:
|
|
||||||
m = _STANDALONE_SLASH_IPA_RE.match(text)
|
|
||||||
if m:
|
|
||||||
inner = m.group(1).strip()
|
|
||||||
if not _SLASH_IPA_REJECT_RE.search(inner):
|
|
||||||
inner = inner.lstrip("'").strip()
|
|
||||||
if inner:
|
|
||||||
new_text = "[" + inner + "]" + text[m.end():]
|
|
||||||
slash_ipa_fixed += 1
|
|
||||||
|
|
||||||
if new_text != text:
|
|
||||||
cell["text"] = new_text
|
|
||||||
|
|
||||||
if slash_ipa_fixed:
|
|
||||||
logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
|
|
||||||
|
|||||||
@@ -1,462 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/build/zones.py
|
||||||
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
|
import importlib as _importlib
|
||||||
detection and zone-aware grid building.
|
import sys as _sys
|
||||||
|
_sys.modules[__name__] = _importlib.import_module("grid.build.zones")
|
||||||
Extracted from grid_build_core.py for maintainability.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
import cv2
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from cv_box_detect import detect_boxes, split_page_into_zones
|
|
||||||
from cv_graphic_detect import detect_graphic_elements
|
|
||||||
from cv_color_detect import recover_colored_text
|
|
||||||
from cv_vocab_types import PageZone
|
|
||||||
from ocr_pipeline_session_store import get_session_image
|
|
||||||
|
|
||||||
from grid_editor_helpers import (
|
|
||||||
_filter_border_strip_words,
|
|
||||||
_filter_border_ghosts,
|
|
||||||
_words_in_zone,
|
|
||||||
_PIPE_RE_VSPLIT,
|
|
||||||
_detect_vertical_dividers,
|
|
||||||
_split_zone_at_vertical_dividers,
|
|
||||||
_merge_content_zones_across_boxes,
|
|
||||||
_build_zone_grid,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
async def _build_zones(
    session_id: str,
    session: dict,
    all_words: List[Dict[str, Any]],
    graphic_rects: List[Dict[str, int]],
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    img_w: int,
    img_h: int,
) -> Dict[str, Any]:
    """Load image, detect graphics/boxes, build zone-aware grids.

    Args:
        session_id: Session whose stored image is loaded; also used in log
            messages.
        session: Session dict (not referenced in this function body).
        all_words: OCR word dicts. Filtered and extended IN PLACE (words
            inside fresh graphic regions, border ghosts and pipe dividers
            are removed; recovered colored words are appended).
        graphic_rects: Known graphic rectangles with x/y/w/h keys; extended
            in place with regions detected here.
        content_x, content_y, content_w, content_h: Content area handed to
            box detection, zone splitting and the fallback grid.
        img_w, img_h: Image size used for the percentage bounding boxes.

    Returns a dict with keys:
        zones_data, boxes_detected, recovered_count, border_prefiltered,
        img_bgr. ``all_words`` is not part of the returned dict; callers see
        its changes through the in-place mutation.
    """
    zones_data: List[Dict[str, Any]] = []
    boxes_detected = 0
    recovered_count = 0
    border_prefiltered = False
    img_bgr = None

    # 3. Load image for box detection
    # Preference order: cropped, then dewarped, then original.
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")

    if img_png:
        # Decode image for color detection + box detection
        arr = np.frombuffer(img_png, dtype=np.uint8)
        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)

        if img_bgr is not None:
            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
            # Only words with at least 3 characters are passed to the detector.
            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
            if fresh_graphics:
                fresh_rects = [
                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
                    for g in fresh_graphics
                ]
                graphic_rects.extend(fresh_rects)
                logger.info(
                    "build-grid session %s: detected %d graphic region(s) via CV",
                    session_id, len(fresh_graphics),
                )
                # Hard-filter words inside newly detected graphic regions
                # (a word counts as inside when its center lies in the rect).
                before = len(all_words)
                all_words[:] = [
                    w for w in all_words
                    if not any(
                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in fresh_rects
                    )
                ]
                removed = before - len(all_words)
                if removed:
                    logger.info(
                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
                        session_id, removed, len(fresh_rects),
                    )

            # --- Recover colored text that OCR missed (before grid building) ---
            recovered = recover_colored_text(img_bgr, all_words)
            if recovered and graphic_rects:
                # Filter recovered chars inside graphic regions
                recovered = [
                    r for r in recovered
                    if not any(
                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in graphic_rects
                    )
                ]
            if recovered:
                recovered_count = len(recovered)
                all_words.extend(recovered)
                logger.info(
                    "build-grid session %s: +%d recovered colored words",
                    session_id, recovered_count,
                )

            # Detect bordered boxes
            boxes = detect_boxes(
                img_bgr,
                content_x=content_x,
                content_w=content_w,
                content_y=content_y,
                content_h=content_h,
            )
            boxes_detected = len(boxes)

            if boxes:
                # Filter border ghost words before grid building
                all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
                if ghost_count:
                    all_words[:] = all_words_new
                    logger.info(
                        "build-grid session %s: removed %d border ghost words",
                        session_id, ghost_count,
                    )

                # Split page into zones
                page_zones = split_page_into_zones(
                    content_x, content_y, content_w, content_h, boxes
                )

                # Merge content zones separated by box zones
                page_zones = _merge_content_zones_across_boxes(
                    page_zones, content_x, content_w
                )

                # 3b. Detect vertical dividers and split content zones
                # NOTE(review): border_prefiltered_vd is not read anywhere
                # below in this function.
                page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
                    page_zones, all_words
                )

                # --- First pass: build grids per zone independently ---
                zone_grids = _build_grids_per_zone(
                    page_zones, all_words, img_w, img_h
                )
                border_prefiltered = border_prefiltered or any(
                    zg.get("_border_prefiltered") for zg in zone_grids
                )

                # --- Second pass: merge column boundaries from all content zones ---
                _merge_content_zone_columns(
                    zone_grids, all_words, content_w, img_w, img_h, session_id
                )

                # --- Build zones_data from zone_grids ---
                for zg in zone_grids:
                    pz = zg["pz"]
                    grid = zg["grid"]
                    # Internal helper key; must not leak into the result.
                    grid.pop("_raw_columns", None)

                    zone_entry: Dict[str, Any] = {
                        "zone_index": pz.index,
                        "zone_type": pz.zone_type,
                        "bbox_px": {
                            "x": pz.x, "y": pz.y,
                            "w": pz.width, "h": pz.height,
                        },
                        "bbox_pct": {
                            "x": round(pz.x / img_w * 100, 2) if img_w else 0,
                            "y": round(pz.y / img_h * 100, 2) if img_h else 0,
                            "w": round(pz.width / img_w * 100, 2) if img_w else 0,
                            "h": round(pz.height / img_h * 100, 2) if img_h else 0,
                        },
                        "border": None,
                        "word_count": len(zg["words"]),
                        **grid,
                    }

                    if pz.box:
                        zone_entry["border"] = {
                            "thickness": pz.box.border_thickness,
                            "confidence": pz.box.confidence,
                        }

                    if pz.image_overlays:
                        zone_entry["image_overlays"] = pz.image_overlays

                    if pz.layout_hint:
                        zone_entry["layout_hint"] = pz.layout_hint
                    if pz.vsplit_group is not None:
                        zone_entry["vsplit_group"] = pz.vsplit_group

                    zones_data.append(zone_entry)

    # 4. Fallback: no boxes detected -> single zone with all words
    # (also reached when no image could be loaded or decoded).
    if not zones_data:
        before = len(all_words)
        # Drop recovered words of at most 2 characters as likely artifacts.
        filtered_words = [
            w for w in all_words
            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
        ]
        removed = before - len(filtered_words)
        if removed:
            logger.info(
                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
                session_id, removed,
            )
        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
        if bs_removed:
            border_prefiltered = True
            logger.info(
                "build-grid session %s: pre-filtered %d border-strip words",
                session_id, bs_removed,
            )
        grid = _build_zone_grid(
            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
        zones_data.append({
            "zone_index": 0,
            "zone_type": "content",
            "bbox_px": {
                "x": content_x, "y": content_y,
                "w": content_w, "h": content_h,
            },
            "bbox_pct": {
                "x": round(content_x / img_w * 100, 2) if img_w else 0,
                "y": round(content_y / img_h * 100, 2) if img_h else 0,
                "w": round(content_w / img_w * 100, 2) if img_w else 0,
                "h": round(content_h / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            # NOTE(review): counts all_words, not filtered_words, although
            # the grid is built from filtered_words — confirm this is intended.
            "word_count": len(all_words),
            **grid,
        })

    return {
        "zones_data": zones_data,
        "boxes_detected": boxes_detected,
        "recovered_count": recovered_count,
        "border_prefiltered": border_prefiltered,
        "img_bgr": img_bgr,
    }
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_and_split_vertical_dividers(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
) -> tuple:
    """Split content zones at detected vertical dividers.

    Non-content zones and content zones without dividers are passed through
    unchanged.  When a content zone is split, the divider ("pipe") words that
    lie in that zone are removed from ``all_words`` in place, so they do not
    show up in the resulting sub-zones.  All returned zones are re-indexed
    sequentially.

    Returns:
        A ``(expanded_zones, border_prefiltered_from_vsplit)`` tuple; the
        second element is always ``False`` here.
    """
    result: List = []
    next_group = 0

    for zone in page_zones:
        divider_xs = None
        if zone.zone_type == "content":
            words_here = _words_in_zone(
                all_words, zone.y, zone.height, zone.x, zone.width
            )
            divider_xs = _detect_vertical_dividers(
                words_here, zone.x, zone.width, zone.y, zone.height
            )

        if not divider_xs:
            # Nothing to split: keep the zone as it is.
            result.append(zone)
            continue

        pieces = _split_zone_at_vertical_dividers(zone, divider_xs, next_group)
        result.extend(pieces)
        next_group += 1

        # Drop the pipe words themselves (matched by identity) so they are
        # not assigned to any of the sub-zones later on.
        pipe_word_ids = {
            id(word)
            for word in words_here
            if _PIPE_RE_VSPLIT.match((word.get("text") or "").strip())
        }
        all_words[:] = [
            word for word in all_words if id(word) not in pipe_word_ids
        ]

        # Logged with the zone's index from before the re-indexing below.
        logger.info(
            "build-grid: vertical split zone %d at x=%s -> %d sub-zones",
            zone.index, [int(x) for x in divider_xs], len(pieces),
        )

    # Splitting changes the zone count, so renumber everything in order.
    for position, zone in enumerate(result):
        zone.index = position

    return result, False
|
|
||||||
|
|
||||||
|
|
||||||
def _build_grids_per_zone(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build a grid for each zone independently (first pass).

    For every zone the words inside its bounds are selected and then
    filtered in three steps before the grid is built:

    1. recovered words whose stripped text has at most two characters,
    2. words whose centre lies inside one of the zone's image overlays,
    3. border-strip words (via ``_filter_border_strip_words``).

    Args:
        page_zones: Zones to process, in order.
        all_words: All OCR word dicts of the page.
        img_w: Image width, forwarded to ``_build_zone_grid``.
        img_h: Image height, forwarded to ``_build_zone_grid``.

    Returns:
        One dict per zone with the keys ``"pz"`` (the zone), ``"words"``
        (the filtered words), ``"grid"`` (result of ``_build_zone_grid``)
        and ``"_border_prefiltered"`` (whether border-strip words were
        removed for this zone).
    """
    zone_grids: List[Dict] = []

    for pz in page_zones:
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        if pz.zone_type == "content":
            logger.info(
                "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
                pz.index, pz.zone_type,
                pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
                len(zone_words), len(all_words),
            )

        # Filter recovered single-char artifacts in ALL zones.
        # ``or ""`` keeps this safe for words carrying ``"text": None``,
        # matching the handling in _detect_and_split_vertical_dividers.
        before = len(zone_words)
        zone_words = [
            w for w in zone_words
            if not (
                w.get("recovered")
                and len((w.get("text") or "").strip()) <= 2
            )
        ]
        removed = before - len(zone_words)
        if removed:
            logger.info(
                "build-grid: filtered %d recovered artifacts from %s zone %d",
                removed, pz.zone_type, pz.index,
            )

        # Filter words inside image overlay regions (merged box zones).
        # A word counts as "inside" when its centre point is in the overlay.
        if pz.image_overlays:
            before_ov = len(zone_words)
            zone_words = [
                w for w in zone_words
                if not any(
                    ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
                    and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
                    for ov in pz.image_overlays
                )
            ]
            ov_removed = before_ov - len(zone_words)
            if ov_removed:
                logger.info(
                    "build-grid: filtered %d words inside image overlays from zone %d",
                    ov_removed, pz.index,
                )

        zone_words, bs_removed = _filter_border_strip_words(zone_words)
        border_prefiltered = bool(bs_removed)
        if border_prefiltered:
            logger.info(
                "build-grid: pre-filtered %d border-strip words from zone %d",
                bs_removed, pz.index,
            )

        grid = _build_zone_grid(
            zone_words, pz.x, pz.y, pz.width, pz.height,
            pz.index, img_w, img_h,
            # Zones with image overlays never treat their first row as header.
            skip_first_row_header=bool(pz.image_overlays),
        )
        zone_grids.append({
            "pz": pz, "words": zone_words, "grid": grid,
            "_border_prefiltered": border_prefiltered,
        })

    return zone_grids
|
|
||||||
|
|
||||||
|
|
||||||
def _merge_content_zone_columns(
    zone_grids: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    content_w: int,
    img_w: int,
    img_h: int,
    session_id: str,
) -> None:
    """Second pass: unify column boundaries across content zones.

    The column split positions of all content zones that are not part of a
    vertical split group are pooled, nearby positions are merged, and every
    content zone is then rebuilt against the shared column layout.  Nothing
    happens when there is at most one such zone, when no zone has more than
    one raw column, or when merging would yield fewer columns than the
    widest single zone already has.

    Modifies ``zone_grids`` in place (replaces each content zone's
    ``"grid"`` entry) and returns ``None``.
    """
    plain_content = [
        entry for entry in zone_grids
        if entry["pz"].zone_type == "content"
        and entry["pz"].vsplit_group is None
    ]
    if len(plain_content) <= 1:
        return

    # A split point is the left edge (x_min) of every column but the first.
    split_points: List[float] = sorted(
        column["x_min"]
        for entry in plain_content
        for column in entry["grid"].get("_raw_columns", [])[1:]
    )
    if not split_points:
        return

    # Collapse split points closer than the threshold to their midpoint.
    threshold = max(25, int(content_w * 0.03))
    boundaries = [split_points[0]]
    for candidate in split_points[1:]:
        if candidate - boundaries[-1] >= threshold:
            boundaries.append(candidate)
        else:
            boundaries[-1] = (boundaries[-1] + candidate) / 2

    total_cols = len(boundaries) + 1
    max_zone_cols = max(
        len(entry["grid"].get("_raw_columns", []))
        for entry in plain_content
    )
    if total_cols < max_zone_cols:
        # Merging would lose columns compared to the best single zone.
        return

    # Outer edges come from the extent of all words on the page.
    left_edge = min(word["left"] for word in all_words)
    right_edge = max(word["left"] + word["width"] for word in all_words)
    edges = [left_edge, *boundaries, right_edge]
    merged_columns: List[Dict[str, Any]] = [
        {
            "index": idx,
            "type": f"column_{idx + 1}",
            "x_min": lower,
            "x_max": upper,
        }
        for idx, (lower, upper) in enumerate(zip(edges, edges[1:]))
    ]

    # Rebuild every content zone (split groups included) on the shared layout.
    for entry in zone_grids:
        zone = entry["pz"]
        if zone.zone_type != "content":
            continue
        entry["grid"] = _build_zone_grid(
            entry["words"], zone.x, zone.y,
            zone.width, zone.height,
            zone.index, img_w, img_h,
            global_columns=merged_columns,
            skip_first_row_header=bool(zone.image_overlays),
        )

    logger.info(
        "build-grid session %s: union of %d content "
        "zones -> %d merged columns (max single zone: %d)",
        session_id, len(plain_content),
        total_cols, max_zone_cols,
    )
|
|
||||||
|
|||||||
@@ -1,31 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/api.py
|
||||||
Grid Editor API — barrel re-export.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
The actual endpoints live in:
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.api")
|
||||||
- grid_editor_api_grid.py (build-grid, rerun-ocr, save-grid, get-grid)
|
|
||||||
- grid_editor_api_gutter.py (gutter-repair, gutter-repair/apply)
|
|
||||||
- grid_editor_api_box.py (build-box-grids)
|
|
||||||
- grid_editor_api_unified.py (build-unified-grid, unified-grid)
|
|
||||||
|
|
||||||
This module re-exports the combined router and key symbols so that
|
|
||||||
existing `from grid_editor_api import router` / `from grid_editor_api import _build_grid_core`
|
|
||||||
continue to work unchanged.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from fastapi import APIRouter
|
|
||||||
|
|
||||||
from grid_editor_api_grid import router as _grid_router
|
|
||||||
from grid_editor_api_gutter import router as _gutter_router
|
|
||||||
from grid_editor_api_box import router as _box_router
|
|
||||||
from grid_editor_api_unified import router as _unified_router
|
|
||||||
|
|
||||||
# Re-export _build_grid_core so callers that do
|
|
||||||
# `from grid_editor_api import _build_grid_core` keep working.
|
|
||||||
from grid_build_core import _build_grid_core # noqa: F401
|
|
||||||
|
|
||||||
# Merge all sub-routers into one combined router
|
|
||||||
router = APIRouter()
|
|
||||||
router.include_router(_grid_router)
|
|
||||||
router.include_router(_gutter_router)
|
|
||||||
router.include_router(_box_router)
|
|
||||||
router.include_router(_unified_router)
|
|
||||||
|
|||||||
@@ -1,177 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/api_box.py
|
||||||
Grid Editor API — box-grid-review endpoints.
|
import importlib as _importlib
|
||||||
"""
|
import sys as _sys
|
||||||
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.api_box")
|
||||||
import logging
|
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, Request
|
|
||||||
|
|
||||||
from grid_editor_helpers import _words_in_zone
|
|
||||||
from ocr_pipeline_session_store import (
|
|
||||||
get_session_db,
|
|
||||||
update_session_db,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/build-box-grids")
async def build_box_grids(session_id: str, request: Request):
    """Rebuild grid structure for all detected boxes with layout-aware detection.

    Uses ``structure_result.boxes`` (falling back to the copy stored under
    ``ground_truth``) as the source of box coordinates and the raw OCR words
    (``raw_paddle_words``, else ``raw_tesseract_words``) as the word source.
    Existing box zones in ``grid_editor_result`` are dropped and rebuilt; the
    updated grid is persisted to the session.

    Optional body: ``{ "overrides": { "0": "bullet_list" } }``
    Maps box_index -> forced layout_type.  A missing, malformed or
    non-object body is treated as "no overrides".

    Raises:
        HTTPException: 404 if the session does not exist; 400 if there is no
            grid data yet or no raw OCR words are available.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    # Get raw OCR words (with top/left/width/height keys)
    word_result = session.get("word_result") or {}
    all_words = word_result.get("raw_paddle_words") or word_result.get("raw_tesseract_words") or []
    if not all_words:
        raise HTTPException(status_code=400, detail="No raw OCR words available.")

    # Get detected boxes from structure_result
    structure_result = session.get("structure_result") or {}
    gt = session.get("ground_truth") or {}
    if not structure_result:
        structure_result = gt.get("structure_result") or {}
    detected_boxes = structure_result.get("boxes") or []
    if not detected_boxes:
        return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"}

    img_w = grid_data.get("image_width", 0) or word_result.get("image_width", 0)
    img_h = grid_data.get("image_height", 0) or word_result.get("image_height", 0)

    # Filter out false-positive boxes in header/footer margins.
    if img_h > 0:
        margin_frac = 0.07  # 7% of image height
        margin_top = img_h * margin_frac
        margin_bottom = img_h * (1 - margin_frac)
        filtered = []
        for box in detected_boxes:
            by = box.get("y", 0)
            bh = box.get("h", 0)
            box_center_y = by + bh / 2
            if box_center_y < margin_top or box_center_y > margin_bottom:
                logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)",
                            by, bh, box_center_y, margin_top, margin_bottom)
                continue
            filtered.append(box)
        detected_boxes = filtered

    # The request body is optional, so a missing or unparsable body is
    # deliberately ignored rather than rejected.
    try:
        body = await request.json()
    except Exception:
        body = {}
    # Guard against valid JSON that is not an object (e.g. a list or null)
    # and against "overrides" not being a mapping.
    overrides = body.get("overrides") if isinstance(body, dict) else None
    layout_overrides = overrides if isinstance(overrides, dict) else {}

    from cv_box_layout import build_box_zone_grid

    # Spell checking is optional: load the checker once for all boxes and
    # skip the correction step when the module is not installed.
    spell_checker = None
    if detected_boxes:
        try:
            from smart_spell import SmartSpellChecker
            spell_checker = SmartSpellChecker()
        except ImportError:
            spell_checker = None

    zones = grid_data.get("zones", [])

    # Find highest existing zone_index.
    # NOTE(review): this is computed before the old box zones are removed,
    # so repeated rebuilds may keep increasing box zone indices — confirm
    # whether that is intended.
    max_zone_idx = max((z.get("zone_index", 0) for z in zones), default=-1)

    # Remove old box zones (we'll rebuild them)
    zones = [z for z in zones if z.get("zone_type") != "box"]

    box_count = 0
    spell_fixes = 0

    for box_idx, box in enumerate(detected_boxes):
        bx = box.get("x", 0)
        by = box.get("y", 0)
        bw = box.get("w", 0)
        bh = box.get("h", 0)

        if bw <= 0 or bh <= 0:
            continue

        # Filter raw OCR words inside this box
        zone_words = _words_in_zone(all_words, by, bh, bx, bw)
        if not zone_words:
            logger.info("Box %d: no words found in bbox (%d,%d,%d,%d)", box_idx, bx, by, bw, bh)
            continue

        zone_idx = max_zone_idx + 1 + box_idx
        forced_layout = layout_overrides.get(str(box_idx))

        # Build box grid
        box_grid = build_box_zone_grid(
            zone_words, bx, by, bw, bh,
            zone_idx, img_w, img_h,
            layout_type=forced_layout,
        )

        # Apply SmartSpellChecker to all box cells
        if spell_checker is not None:
            for cell in box_grid.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                result = spell_checker.correct_text(text, lang="auto")
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fixes += 1

        # Build zone entry
        zone_entry = {
            "zone_index": zone_idx,
            "zone_type": "box",
            "bbox_px": {"x": bx, "y": by, "w": bw, "h": bh},
            "bbox_pct": {
                "x": round(bx / img_w * 100, 2) if img_w else 0,
                "y": round(by / img_h * 100, 2) if img_h else 0,
                "w": round(bw / img_w * 100, 2) if img_w else 0,
                "h": round(bh / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            "word_count": len(zone_words),
            "columns": box_grid["columns"],
            "rows": box_grid["rows"],
            "cells": box_grid["cells"],
            "header_rows": box_grid.get("header_rows", []),
            "box_layout_type": box_grid.get("box_layout_type", "flowing"),
            "box_grid_reviewed": False,
            "box_bg_color": box.get("bg_color_name", ""),
            "box_bg_hex": box.get("bg_color_hex", ""),
        }
        zones.append(zone_entry)
        box_count += 1

    # Sort zones by y-position for correct reading order
    zones.sort(key=lambda z: z.get("bbox_px", {}).get("y", 0))

    grid_data["zones"] = zones
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "build-box-grids session %s: %d boxes processed (%d words spell-fixed) from %d detected",
        session_id, box_count, spell_fixes, len(detected_boxes),
    )

    return {
        "session_id": session_id,
        "box_zones_rebuilt": box_count,
        "total_detected_boxes": len(detected_boxes),
        "spell_fixes": spell_fixes,
        "zones": zones,
    }
|
|
||||||
|
|||||||
@@ -1,337 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/api_grid.py
|
||||||
Grid Editor API — grid build, save, and retrieve endpoints.
|
import importlib as _importlib
|
||||||
"""
|
import sys as _sys
|
||||||
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.api_grid")
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
from typing import Any, Dict
|
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, Query, Request
|
|
||||||
|
|
||||||
from grid_build_core import _build_grid_core
|
|
||||||
from ocr_pipeline_session_store import (
|
|
||||||
get_session_db,
|
|
||||||
update_session_db,
|
|
||||||
)
|
|
||||||
from ocr_pipeline_common import (
|
|
||||||
_cache,
|
|
||||||
_load_session_to_cache,
|
|
||||||
_get_cached,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/build-grid")
async def build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
):
    """Build the zone-aware structured grid from a session's existing words.

    Delegates the actual work to ``_build_grid_core``.  ``max_cols`` and
    ``min_conf`` values of 0 (or less) are forwarded as ``None``.  Besides
    the grid itself, a reference snapshot of the automatic result is stored
    under ``ground_truth["auto_grid_snapshot"]`` so it can later be compared
    with manual corrections, and ``current_step`` is set to 11.

    Returns:
        The grid result produced by ``_build_grid_core``.

    Raises:
        HTTPException: 404 if the session does not exist; 400 if
            ``_build_grid_core`` raises ``ValueError``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    column_limit = max_cols if max_cols > 0 else None
    confidence_floor = min_conf if min_conf > 0 else None
    try:
        grid_result = await _build_grid_core(
            session_id, session,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=column_limit,
            min_conf=confidence_floor,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Imported here, not at module level, to avoid a circular dependency
    # with ocr_pipeline_regression.
    from ocr_pipeline_regression import _build_reference_snapshot

    # Map the OCR engine recorded with the words to a pipeline label.
    ocr_engine = (session.get("word_result") or {}).get("ocr_engine", "")
    if ocr_engine == "paddle_direct":
        pipeline_label = "paddle-direct"
    elif ocr_engine in ("kombi", "rapid_kombi"):
        pipeline_label = "kombi"
    else:
        pipeline_label = "pipeline"

    ground_truth = session.get("ground_truth") or {}
    ground_truth["auto_grid_snapshot"] = _build_reference_snapshot(
        grid_result, pipeline=pipeline_label
    )

    # Persist and advance current_step to 11 (reconstruction complete).
    await update_session_db(
        session_id,
        grid_editor_result=grid_result,
        ground_truth=ground_truth,
        current_step=11,
    )

    summary = grid_result.get("summary", {})
    logger.info(
        "build-grid session %s: %d zones, %d cols, %d rows, %d cells, "
        "%d boxes in %.2fs",
        session_id,
        len(grid_result.get("zones", [])),
        summary.get("total_columns", 0),
        summary.get("total_rows", 0),
        summary.get("total_cells", 0),
        grid_result.get("boxes_detected", 0),
        grid_result.get("duration_seconds", 0),
    )

    return grid_result
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/rerun-ocr-and-build-grid")
async def rerun_ocr_and_build_grid(
    session_id: str,
    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"),
    max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"),
    vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"),
    doc_category: str = Query("", description="Document type for Vision-LLM prompt context"),
):
    """Re-run OCR with quality settings, then rebuild the grid.

    Unlike build-grid (which only rebuilds from existing words), this
    endpoint re-runs OCR on the cropped (or, failing that, dewarped) image
    with optional enhancement, stores the new ``word_result`` in the
    session, and then builds the grid from it.

    Steps executed: Image Enhancement -> OCR -> Grid Build

    Returns:
        The grid result from ``_build_grid_core`` extended with the keys
        ``"scan_quality"`` and ``"ocr_stats"``.

    Raises:
        HTTPException: 404 if the session does not exist (initially or after
            the word result was stored); 400 if no cropped/dewarped image is
            available or ``_build_grid_core`` raises ``ValueError``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    t0 = time.time()

    # 1. Load the cropped/dewarped image from cache or session
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    source_bgr = cached.get("cropped_bgr")
    if source_bgr is None:
        source_bgr = cached.get("dewarped_bgr")
    if source_bgr is None:
        raise HTTPException(status_code=400, detail="No cropped/dewarped image available. Run preprocessing steps first.")

    img_h, img_w = source_bgr.shape[:2]
    ocr_input = source_bgr.copy()

    # 2. Scan quality assessment (best effort: OCR continues with a default
    #    confidence threshold of 40 when the assessment fails)
    scan_quality_info = {}
    try:
        from scan_quality import score_scan_quality
        quality_report = score_scan_quality(ocr_input)
        scan_quality_info = quality_report.to_dict()
        actual_min_conf = min_conf if min_conf > 0 else quality_report.recommended_min_conf
    except Exception as e:
        logger.warning("rerun-ocr: scan quality failed: %s", e)
        actual_min_conf = min_conf if min_conf > 0 else 40

    # 3. Image enhancement (Step 3) — only for scans rated as degraded
    is_degraded = scan_quality_info.get("is_degraded", False)
    enhance_applied = False
    if enhance and is_degraded:
        try:
            from ocr_image_enhance import enhance_for_ocr
            ocr_input = enhance_for_ocr(ocr_input, is_degraded=True)
            # Only report enhancement as applied when it actually succeeded.
            enhance_applied = True
            logger.info("rerun-ocr: CLAHE enhancement applied")
        except Exception as e:
            logger.warning("rerun-ocr: enhancement failed: %s", e)

    # 4. Run dual-engine OCR
    from PIL import Image
    import pytesseract

    # RapidOCR (best effort: Tesseract results are still used on failure)
    rapid_words = []
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(ocr_input, full_region) or []
    except Exception as e:
        logger.warning("rerun-ocr: RapidOCR failed: %s", e)

    # Tesseract (channel order is reversed before handing the array to PIL)
    pil_img = Image.fromarray(ocr_input[:, :, ::-1])
    data = pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3', output_type=pytesseract.Output.DICT)
    tess_words = []
    for raw_text, raw_conf, left, top, width, height in zip(
        data["text"], data["conf"], data["left"],
        data["top"], data["width"], data["height"],
    ):
        text = (raw_text or "").strip()
        # Confidence may arrive as int, as "-1", or as a decimal string;
        # anything unparsable counts as -1 and is therefore dropped.
        try:
            conf = int(float(raw_conf))
        except (TypeError, ValueError, OverflowError):
            conf = -1
        if not text or conf < actual_min_conf:
            continue
        tess_words.append({
            "text": text, "left": left, "top": top,
            "width": width, "height": height, "conf": conf,
        })

    # 5. Merge OCR results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words

    def _cells_for(words):
        """Wrap all words into the single-cell structure stored in word_result."""
        word_boxes = [
            {"text": w["text"], "left": w["left"], "top": w["top"],
             "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)}
            for w in words
        ]
        return [{"text": " ".join(w["text"] for w in words),
                 "word_boxes": word_boxes}]

    # 6. Store updated word_result in session
    word_result = {
        "cells": _cells_for(merged_words),
        "image_width": img_w,
        "image_height": img_h,
        "ocr_engine": "rapid_kombi",
        "word_count": len(merged_words),
        "raw_paddle_words": rapid_words,
    }

    # 6b. Vision-LLM Fusion (Step 4) — correct OCR using Vision model
    #     (best effort: the merged OCR words are kept when fusion fails)
    vision_applied = False
    if vision_fusion:
        try:
            from vision_ocr_fusion import vision_fuse_ocr
            category = doc_category or session.get("document_category") or "vokabelseite"
            logger.info("rerun-ocr: running Vision-LLM fusion (category=%s)", category)
            merged_words = await vision_fuse_ocr(ocr_input, merged_words, category)
            vision_applied = True
            # Rebuild storage from fused words
            word_result["cells"] = _cells_for(merged_words)
            word_result["word_count"] = len(merged_words)
            word_result["ocr_engine"] = "vision_fusion"
        except Exception as e:
            logger.warning("rerun-ocr: Vision-LLM fusion failed: %s", e)

    await update_session_db(session_id, word_result=word_result)

    # Reload session with updated word_result
    session = await get_session_db(session_id)
    if not session:
        # The session vanished between the update and the reload.
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ocr_duration = time.time() - t0
    logger.info(
        "rerun-ocr session %s: %d words (rapid=%d, tess=%d, merged=%d) in %.1fs "
        "(enhance=%s, min_conf=%d, quality=%s)",
        session_id, len(merged_words), len(rapid_words), len(tess_words),
        len(merged_words), ocr_duration, enhance, actual_min_conf,
        scan_quality_info.get("quality_pct", "?"),
    )

    # 7. Build grid from new words
    try:
        result = await _build_grid_core(
            session_id, session,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            enhance=enhance,
            max_columns=max_cols if max_cols > 0 else None,
            min_conf=min_conf if min_conf > 0 else None,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e

    # Persist grid
    await update_session_db(session_id, grid_editor_result=result, current_step=11)

    # Add quality info to response
    result["scan_quality"] = scan_quality_info
    result["ocr_stats"] = {
        "rapid_words": len(rapid_words),
        "tess_words": len(tess_words),
        "merged_words": len(merged_words),
        "min_conf_used": actual_min_conf,
        "enhance_applied": enhance_applied,
        "vision_fusion_applied": vision_applied,
        "document_category": doc_category or session.get("document_category", ""),
        "ocr_duration_seconds": round(ocr_duration, 1),
    }

    total_duration = time.time() - t0
    logger.info(
        "rerun-ocr+build-grid session %s: %d zones, %d cols, %d cells in %.1fs",
        session_id,
        len(result.get("zones", [])),
        result.get("summary", {}).get("total_columns", 0),
        result.get("summary", {}).get("total_cells", 0),
        total_duration,
    )

    return result
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/save-grid")
async def save_grid(session_id: str, request: Request):
    """Save edited grid data from the frontend editor.

    Receives the full grid with user edits as a JSON object and persists it
    to the session's ``grid_editor_result``.  ``image_width``,
    ``image_height``, ``boxes_detected``, ``summary`` and ``formatting`` are
    taken from the body when present and otherwise from the previously
    stored result; ``duration_seconds`` is always kept from the stored
    result.  The saved result is flagged with ``"edited": True``.

    Returns:
        ``{"session_id": <id>, "saved": True}``

    Raises:
        HTTPException: 404 if the session does not exist; 400 if the body is
            not valid JSON, is not an object containing ``zones``, or
            ``zones`` is not a list.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc

    # Validate basic structure
    if not isinstance(body, dict) or "zones" not in body:
        raise HTTPException(status_code=400, detail="Missing 'zones' in request body")
    zones = body["zones"]
    if not isinstance(zones, list):
        raise HTTPException(status_code=400, detail="'zones' must be a list")

    # Preserve metadata from the original build
    existing = session.get("grid_editor_result") or {}
    result = {
        "session_id": session_id,
        "image_width": body.get("image_width", existing.get("image_width", 0)),
        "image_height": body.get("image_height", existing.get("image_height", 0)),
        "zones": zones,
        "boxes_detected": body.get("boxes_detected", existing.get("boxes_detected", 0)),
        "summary": body.get("summary", existing.get("summary", {})),
        "formatting": body.get("formatting", existing.get("formatting", {})),
        "duration_seconds": existing.get("duration_seconds", 0),
        "edited": True,
    }

    await update_session_db(session_id, grid_editor_result=result, current_step=11)

    logger.info("save-grid session %s: %d zones saved", session_id, len(zones))

    return {"session_id": session_id, "saved": True}
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/sessions/{session_id}/grid-editor")
async def get_grid(session_id: str):
    """Return the stored grid editor result of a session.

    Raises:
        HTTPException: 404 if the session does not exist or has no grid
            editor result yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_state = session.get("grid_editor_result")
    if grid_state:
        return grid_state

    raise HTTPException(
        status_code=404,
        detail="No grid editor data. Run build-grid first.",
    )
|
|
||||||
|
|||||||
@@ -1,110 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/api_gutter.py
|
||||||
Grid Editor API — gutter repair endpoints.
|
import importlib as _importlib
|
||||||
"""
|
import sys as _sys
|
||||||
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.api_gutter")
|
||||||
import logging
|
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, Request
|
|
||||||
|
|
||||||
from ocr_pipeline_session_store import (
|
|
||||||
get_session_db,
|
|
||||||
update_session_db,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/gutter-repair")
async def gutter_repair(session_id: str):
    """Scan the session's grid for gutter-edge OCR errors and return suggestions.

    Suggestion kinds the analysis is meant to produce:
    - spell_fix: words truncated/blurred at the book binding
    - hyphen_join: words split across rows with missing hyphen chars

    The analysis result is stored under ``ground_truth["gutter_repair"]`` and
    returned unchanged to the caller.

    Raises:
        HTTPException: 404 for an unknown session, 400 when the session has
            no grid editor result yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid = session.get("grid_editor_result")
    if not grid:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    # Imported lazily, only once the request has been validated.
    from cv_gutter_repair import analyse_grid_for_gutter_repair

    width = grid.get("image_width", 0)
    analysis = analyse_grid_for_gutter_repair(grid, image_width=width)

    # Suggestions live inside ground_truth so no DB migration is needed.
    ground_truth = session.get("ground_truth") or {}
    ground_truth["gutter_repair"] = analysis
    await update_session_db(session_id, ground_truth=ground_truth)

    found = analysis.get("stats", {}).get("suggestions_found", 0)
    elapsed = analysis.get("duration_seconds", 0)
    logger.info(
        "gutter-repair session %s: %d suggestions in %.2fs",
        session_id,
        found,
        elapsed,
    )
    return analysis
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/gutter-repair/apply")
async def gutter_repair_apply(session_id: str, request: Request):
    """Apply accepted gutter repair suggestions to the grid.

    Body::

        {
          "accepted": ["suggestion_id_1", "suggestion_id_2", ...],
          "text_overrides": {"suggestion_id_1": "alternative text"}   # optional
        }

    ``text_overrides`` lets the user pick a different correction than the
    stored ``suggested_text``; empty override values are ignored.

    Returns:
        The result of ``apply_gutter_suggestions``; when nothing was accepted,
        ``{"applied_count": 0, "changes": []}`` without touching the session.

    Raises:
        HTTPException: 404 for an unknown session; 400 when grid data or
            gutter-repair data is missing, or when the body / ``text_overrides``
            is not a JSON object.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data.")

    gt = session.get("ground_truth") or {}
    gutter_result = gt.get("gutter_repair")
    if not gutter_result:
        raise HTTPException(
            status_code=400,
            detail="No gutter repair data. Run gutter-repair first.",
        )

    body = await request.json()
    if not isinstance(body, dict):
        # A JSON array/string/null body would otherwise fail on .get() with a 500.
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    accepted_ids = body.get("accepted") or []
    if not accepted_ids:
        return {"applied_count": 0, "changes": []}

    # "text_overrides": null is treated like an absent key.
    text_overrides = body.get("text_overrides") or {}
    if not isinstance(text_overrides, dict):
        raise HTTPException(status_code=400, detail="'text_overrides' must be a JSON object")

    from cv_gutter_repair import apply_gutter_suggestions

    suggestions = gutter_result.get("suggestions", [])

    # Apply user-selected alternatives before passing to apply
    for s in suggestions:
        override = text_overrides.get(s.get("id", ""))
        if override:
            s["suggested_text"] = override

    result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)

    # Save updated grid back to session
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "gutter-repair/apply session %s: %d changes applied",
        session_id,
        result.get("applied_count", 0),
    )

    return result
|
|
||||||
|
|||||||
@@ -1,71 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/api_unified.py
|
||||||
Grid Editor API — unified grid endpoints.
|
import importlib as _importlib
|
||||||
"""
|
import sys as _sys
|
||||||
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.api_unified")
|
||||||
import logging
|
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException
|
|
||||||
|
|
||||||
from ocr_pipeline_session_store import (
|
|
||||||
get_session_db,
|
|
||||||
update_session_db,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/build-unified-grid")
async def build_unified_grid_endpoint(session_id: str):
    """Build the unified grid for a session from its multi-zone grid.

    Reads ``grid_editor_result`` from the session, passes its zones, image
    dimensions and layout metrics to ``build_unified_grid`` and stores the
    outcome in the separate ``unified_grid_result`` field, so the original
    multi-zone grid is not overwritten.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no grid has been
            built yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    source_grid = session.get("grid_editor_result")
    if not source_grid:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    # Imported lazily, only once the request has been validated.
    from unified_grid import build_unified_grid

    unified = build_unified_grid(
        zones=source_grid.get("zones", []),
        image_width=source_grid.get("image_width", 0),
        image_height=source_grid.get("image_height", 0),
        layout_metrics=source_grid.get("layout_metrics", {}),
    )

    await update_session_db(session_id, unified_grid_result=unified)

    row_total = unified.get("summary", {}).get("total_rows", 0)
    cell_total = unified.get("summary", {}).get("total_cells", 0)
    logger.info(
        "build-unified-grid session %s: %d rows, %d cells",
        session_id,
        row_total,
        cell_total,
    )
    return unified
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/sessions/{session_id}/unified-grid")
async def get_unified_grid(session_id: str):
    """Return the stored unified grid of a session.

    Raises:
        HTTPException: 404 when the session does not exist, and 404 when the
            session has no (or an empty) ``unified_grid_result``.
    """
    session = await get_session_db(session_id)
    if session:
        unified = session.get("unified_grid_result")
        if unified:
            return unified
        # Session exists but the unified grid was never built.
        raise HTTPException(
            status_code=404,
            detail="No unified grid. Run build-unified-grid first.",
        )
    raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
|
|
||||||
|
|||||||
@@ -1,492 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/columns.py
|
||||||
Grid Editor — column detection, cross-column splitting, marker merging.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Split from grid_editor_helpers.py for maintainability.
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.columns")
|
||||||
All functions are pure computation — no HTTP, DB, or session side effects.
|
|
||||||
|
|
||||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
||||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Cross-column word splitting
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
_spell_cache: Optional[Any] = None
|
|
||||||
_spell_loaded = False
|
|
||||||
|
|
||||||
|
|
||||||
def _is_recognized_word(text: str) -> bool:
|
|
||||||
"""Check if *text* is a recognized German or English word.
|
|
||||||
|
|
||||||
Uses the spellchecker library (same as cv_syllable_detect.py).
|
|
||||||
Returns True for real words like "oder", "Kabel", "Zeitung".
|
|
||||||
Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
|
|
||||||
"""
|
|
||||||
global _spell_cache, _spell_loaded
|
|
||||||
if not text or len(text) < 2:
|
|
||||||
return False
|
|
||||||
|
|
||||||
if not _spell_loaded:
|
|
||||||
_spell_loaded = True
|
|
||||||
try:
|
|
||||||
from spellchecker import SpellChecker
|
|
||||||
_spell_cache = SpellChecker(language="de")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if _spell_cache is None:
|
|
||||||
return False
|
|
||||||
|
|
||||||
return text.lower() in _spell_cache
|
|
||||||
|
|
||||||
|
|
||||||
def _split_cross_column_words(
|
|
||||||
words: List[Dict],
|
|
||||||
columns: List[Dict],
|
|
||||||
) -> List[Dict]:
|
|
||||||
"""Split word boxes that span across column boundaries.
|
|
||||||
|
|
||||||
When OCR merges adjacent words from different columns (e.g. "sichzie"
|
|
||||||
spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
|
|
||||||
split the word box at the column boundary so each piece is assigned
|
|
||||||
to the correct column.
|
|
||||||
|
|
||||||
Only splits when:
|
|
||||||
- The word has significant overlap (>15% of its width) on both sides
|
|
||||||
- AND the word is not a recognized real word (OCR merge artifact), OR
|
|
||||||
the word contains a case transition (lowercase->uppercase) near the
|
|
||||||
boundary indicating two merged words like "dasZimmer".
|
|
||||||
"""
|
|
||||||
if len(columns) < 2:
|
|
||||||
return words
|
|
||||||
|
|
||||||
# Column boundaries = midpoints between adjacent column edges
|
|
||||||
boundaries = []
|
|
||||||
for i in range(len(columns) - 1):
|
|
||||||
boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
|
|
||||||
boundaries.append(boundary)
|
|
||||||
|
|
||||||
new_words: List[Dict] = []
|
|
||||||
split_count = 0
|
|
||||||
for w in words:
|
|
||||||
w_left = w["left"]
|
|
||||||
w_width = w["width"]
|
|
||||||
w_right = w_left + w_width
|
|
||||||
text = (w.get("text") or "").strip()
|
|
||||||
|
|
||||||
if not text or len(text) < 4 or w_width < 10:
|
|
||||||
new_words.append(w)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Find the first boundary this word straddles significantly
|
|
||||||
split_boundary = None
|
|
||||||
for b in boundaries:
|
|
||||||
if w_left < b < w_right:
|
|
||||||
left_part = b - w_left
|
|
||||||
right_part = w_right - b
|
|
||||||
# Both sides must have at least 15% of the word width
|
|
||||||
if left_part > w_width * 0.15 and right_part > w_width * 0.15:
|
|
||||||
split_boundary = b
|
|
||||||
break
|
|
||||||
|
|
||||||
if split_boundary is None:
|
|
||||||
new_words.append(w)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Compute approximate split position in the text.
|
|
||||||
left_width = split_boundary - w_left
|
|
||||||
split_ratio = left_width / w_width
|
|
||||||
approx_pos = len(text) * split_ratio
|
|
||||||
|
|
||||||
# Strategy 1: look for a case transition (lowercase->uppercase) near
|
|
||||||
# the approximate split point — e.g. "dasZimmer" splits at 'Z'.
|
|
||||||
split_char = None
|
|
||||||
search_lo = max(1, int(approx_pos) - 3)
|
|
||||||
search_hi = min(len(text), int(approx_pos) + 2)
|
|
||||||
for i in range(search_lo, search_hi):
|
|
||||||
if text[i - 1].islower() and text[i].isupper():
|
|
||||||
split_char = i
|
|
||||||
break
|
|
||||||
|
|
||||||
# Strategy 2: if no case transition, only split if the whole word
|
|
||||||
# is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
|
|
||||||
# Real words like "oder", "Kabel", "Zeitung" must not be split.
|
|
||||||
if split_char is None:
|
|
||||||
clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct
|
|
||||||
if _is_recognized_word(clean):
|
|
||||||
new_words.append(w)
|
|
||||||
continue
|
|
||||||
# Not a real word — use floor of proportional position
|
|
||||||
split_char = max(1, min(len(text) - 1, int(approx_pos)))
|
|
||||||
|
|
||||||
left_text = text[:split_char].rstrip()
|
|
||||||
right_text = text[split_char:].lstrip()
|
|
||||||
|
|
||||||
if len(left_text) < 2 or len(right_text) < 2:
|
|
||||||
new_words.append(w)
|
|
||||||
continue
|
|
||||||
|
|
||||||
right_width = w_width - round(left_width)
|
|
||||||
new_words.append({
|
|
||||||
**w,
|
|
||||||
"text": left_text,
|
|
||||||
"width": round(left_width),
|
|
||||||
})
|
|
||||||
new_words.append({
|
|
||||||
**w,
|
|
||||||
"text": right_text,
|
|
||||||
"left": round(split_boundary),
|
|
||||||
"width": right_width,
|
|
||||||
})
|
|
||||||
split_count += 1
|
|
||||||
logger.info(
|
|
||||||
"split cross-column word %r -> %r + %r at boundary %.0f",
|
|
||||||
text, left_text, right_text, split_boundary,
|
|
||||||
)
|
|
||||||
|
|
||||||
if split_count:
|
|
||||||
logger.info("split %d cross-column word(s)", split_count)
|
|
||||||
return new_words
|
|
||||||
|
|
||||||
|
|
||||||
def _cluster_columns_by_alignment(
|
|
||||||
words: List[Dict],
|
|
||||||
zone_w: int,
|
|
||||||
rows: List[Dict],
|
|
||||||
) -> List[Dict[str, Any]]:
|
|
||||||
"""Detect columns by clustering left-edge alignment across rows.
|
|
||||||
|
|
||||||
Hybrid approach:
|
|
||||||
1. Group words by row, find "group start" positions within each row
|
|
||||||
(words preceded by a large gap or first word in row)
|
|
||||||
2. Cluster group-start left-edges by X-proximity across rows
|
|
||||||
3. Filter by row coverage (how many rows have a group start here)
|
|
||||||
4. Merge nearby clusters
|
|
||||||
5. Build column boundaries
|
|
||||||
|
|
||||||
This filters out mid-phrase word positions (e.g. IPA transcriptions,
|
|
||||||
second words in multi-word entries) by only considering positions
|
|
||||||
where a new word group begins within a row.
|
|
||||||
"""
|
|
||||||
if not words or not rows:
|
|
||||||
return []
|
|
||||||
|
|
||||||
total_rows = len(rows)
|
|
||||||
if total_rows == 0:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# --- Group words by row ---
|
|
||||||
row_words: Dict[int, List[Dict]] = {}
|
|
||||||
for w in words:
|
|
||||||
y_center = w["top"] + w["height"] / 2
|
|
||||||
best = min(rows, key=lambda r: abs(r["y_center"] - y_center))
|
|
||||||
row_words.setdefault(best["index"], []).append(w)
|
|
||||||
|
|
||||||
# --- Compute adaptive gap threshold for group-start detection ---
|
|
||||||
all_gaps: List[float] = []
|
|
||||||
for ri, rw_list in row_words.items():
|
|
||||||
sorted_rw = sorted(rw_list, key=lambda w: w["left"])
|
|
||||||
for i in range(len(sorted_rw) - 1):
|
|
||||||
right = sorted_rw[i]["left"] + sorted_rw[i]["width"]
|
|
||||||
gap = sorted_rw[i + 1]["left"] - right
|
|
||||||
if gap > 0:
|
|
||||||
all_gaps.append(gap)
|
|
||||||
|
|
||||||
if all_gaps:
|
|
||||||
sorted_gaps = sorted(all_gaps)
|
|
||||||
median_gap = sorted_gaps[len(sorted_gaps) // 2]
|
|
||||||
heights = [w["height"] for w in words if w.get("height", 0) > 0]
|
|
||||||
median_h = sorted(heights)[len(heights) // 2] if heights else 25
|
|
||||||
|
|
||||||
# For small word counts (boxes, sub-zones): PaddleOCR returns
|
|
||||||
# multi-word blocks, so ALL inter-word gaps are potential column
|
|
||||||
# boundaries. Use a low threshold based on word height — any gap
|
|
||||||
# wider than ~1x median word height is a column separator.
|
|
||||||
if len(words) <= 60:
|
|
||||||
gap_threshold = max(median_h * 1.0, 25)
|
|
||||||
logger.info(
|
|
||||||
"alignment columns (small zone): gap_threshold=%.0f "
|
|
||||||
"(median_h=%.0f, %d words, %d gaps: %s)",
|
|
||||||
gap_threshold, median_h, len(words), len(sorted_gaps),
|
|
||||||
[int(g) for g in sorted_gaps[:10]],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Standard approach for large zones (full pages)
|
|
||||||
gap_threshold = max(median_gap * 3, median_h * 1.5, 30)
|
|
||||||
# Cap at 25% of zone width
|
|
||||||
max_gap = zone_w * 0.25
|
|
||||||
if gap_threshold > max_gap > 30:
|
|
||||||
logger.info("alignment columns: capping gap_threshold %.0f -> %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w)
|
|
||||||
gap_threshold = max_gap
|
|
||||||
else:
|
|
||||||
gap_threshold = 50
|
|
||||||
|
|
||||||
# --- Find group-start positions (left-edges that begin a new column) ---
|
|
||||||
start_positions: List[tuple] = [] # (left_edge, row_index)
|
|
||||||
for ri, rw_list in row_words.items():
|
|
||||||
sorted_rw = sorted(rw_list, key=lambda w: w["left"])
|
|
||||||
# First word in row is always a group start
|
|
||||||
start_positions.append((sorted_rw[0]["left"], ri))
|
|
||||||
for i in range(1, len(sorted_rw)):
|
|
||||||
right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"]
|
|
||||||
gap = sorted_rw[i]["left"] - right_prev
|
|
||||||
if gap >= gap_threshold:
|
|
||||||
start_positions.append((sorted_rw[i]["left"], ri))
|
|
||||||
|
|
||||||
start_positions.sort(key=lambda x: x[0])
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"alignment columns: %d group-start positions from %d words "
|
|
||||||
"(gap_threshold=%.0f, %d rows)",
|
|
||||||
len(start_positions), len(words), gap_threshold, total_rows,
|
|
||||||
)
|
|
||||||
|
|
||||||
if not start_positions:
|
|
||||||
x_min = min(w["left"] for w in words)
|
|
||||||
x_max = max(w["left"] + w["width"] for w in words)
|
|
||||||
return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
|
|
||||||
|
|
||||||
# --- Cluster group-start positions by X-proximity ---
|
|
||||||
tolerance = max(10, int(zone_w * 0.01))
|
|
||||||
clusters: List[Dict[str, Any]] = []
|
|
||||||
cur_edges = [start_positions[0][0]]
|
|
||||||
cur_rows = {start_positions[0][1]}
|
|
||||||
|
|
||||||
for left, row_idx in start_positions[1:]:
|
|
||||||
if left - cur_edges[-1] <= tolerance:
|
|
||||||
cur_edges.append(left)
|
|
||||||
cur_rows.add(row_idx)
|
|
||||||
else:
|
|
||||||
clusters.append({
|
|
||||||
"mean_x": int(sum(cur_edges) / len(cur_edges)),
|
|
||||||
"min_edge": min(cur_edges),
|
|
||||||
"max_edge": max(cur_edges),
|
|
||||||
"count": len(cur_edges),
|
|
||||||
"distinct_rows": len(cur_rows),
|
|
||||||
"row_coverage": len(cur_rows) / total_rows,
|
|
||||||
})
|
|
||||||
cur_edges = [left]
|
|
||||||
cur_rows = {row_idx}
|
|
||||||
clusters.append({
|
|
||||||
"mean_x": int(sum(cur_edges) / len(cur_edges)),
|
|
||||||
"min_edge": min(cur_edges),
|
|
||||||
"max_edge": max(cur_edges),
|
|
||||||
"count": len(cur_edges),
|
|
||||||
"distinct_rows": len(cur_rows),
|
|
||||||
"row_coverage": len(cur_rows) / total_rows,
|
|
||||||
})
|
|
||||||
|
|
||||||
# --- Filter by row coverage ---
|
|
||||||
# These thresholds must be high enough to avoid false columns in flowing
|
|
||||||
# text (random inter-word gaps) while still detecting real columns in
|
|
||||||
# vocabulary worksheets (which typically have >80% row coverage).
|
|
||||||
MIN_COVERAGE_PRIMARY = 0.35
|
|
||||||
MIN_COVERAGE_SECONDARY = 0.12
|
|
||||||
MIN_WORDS_SECONDARY = 4
|
|
||||||
MIN_DISTINCT_ROWS = 3
|
|
||||||
|
|
||||||
# Content boundary for left-margin detection
|
|
||||||
content_x_min = min(w["left"] for w in words)
|
|
||||||
content_x_max = max(w["left"] + w["width"] for w in words)
|
|
||||||
content_span = content_x_max - content_x_min
|
|
||||||
|
|
||||||
primary = [
|
|
||||||
c for c in clusters
|
|
||||||
if c["row_coverage"] >= MIN_COVERAGE_PRIMARY
|
|
||||||
and c["distinct_rows"] >= MIN_DISTINCT_ROWS
|
|
||||||
]
|
|
||||||
primary_ids = {id(c) for c in primary}
|
|
||||||
secondary = [
|
|
||||||
c for c in clusters
|
|
||||||
if id(c) not in primary_ids
|
|
||||||
and c["row_coverage"] >= MIN_COVERAGE_SECONDARY
|
|
||||||
and c["count"] >= MIN_WORDS_SECONDARY
|
|
||||||
and c["distinct_rows"] >= MIN_DISTINCT_ROWS
|
|
||||||
]
|
|
||||||
|
|
||||||
# Tertiary: narrow left-margin columns (page refs, markers) that have
|
|
||||||
# too few rows for secondary but are clearly left-aligned and separated
|
|
||||||
# from the main content. These appear at the far left or far right and
|
|
||||||
# have a large gap to the nearest significant cluster.
|
|
||||||
used_ids = {id(c) for c in primary} | {id(c) for c in secondary}
|
|
||||||
sig_xs = [c["mean_x"] for c in primary + secondary]
|
|
||||||
|
|
||||||
# Tertiary: clusters that are clearly to the LEFT of the first
|
|
||||||
# significant column (or RIGHT of the last). If words consistently
|
|
||||||
# start at a position left of the established first column boundary,
|
|
||||||
# they MUST be a separate column — regardless of how few rows they
|
|
||||||
# cover. The only requirement is a clear spatial gap.
|
|
||||||
MIN_COVERAGE_TERTIARY = 0.02 # at least 1 row effectively
|
|
||||||
tertiary = []
|
|
||||||
for c in clusters:
|
|
||||||
if id(c) in used_ids:
|
|
||||||
continue
|
|
||||||
if c["distinct_rows"] < 1:
|
|
||||||
continue
|
|
||||||
if c["row_coverage"] < MIN_COVERAGE_TERTIARY:
|
|
||||||
continue
|
|
||||||
# Must be near left or right content margin (within 15%)
|
|
||||||
rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5
|
|
||||||
if not (rel_pos < 0.15 or rel_pos > 0.85):
|
|
||||||
continue
|
|
||||||
# Must have significant gap to nearest significant cluster
|
|
||||||
if sig_xs:
|
|
||||||
min_dist = min(abs(c["mean_x"] - sx) for sx in sig_xs)
|
|
||||||
if min_dist < max(30, content_span * 0.02):
|
|
||||||
continue
|
|
||||||
tertiary.append(c)
|
|
||||||
|
|
||||||
if tertiary:
|
|
||||||
for c in tertiary:
|
|
||||||
logger.info(
|
|
||||||
" tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
|
|
||||||
c["mean_x"], c["min_edge"], c["max_edge"],
|
|
||||||
c["count"], c["distinct_rows"], c["row_coverage"] * 100,
|
|
||||||
)
|
|
||||||
|
|
||||||
significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"])
|
|
||||||
|
|
||||||
for c in significant:
|
|
||||||
logger.info(
|
|
||||||
" significant cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
|
|
||||||
c["mean_x"], c["min_edge"], c["max_edge"],
|
|
||||||
c["count"], c["distinct_rows"], c["row_coverage"] * 100,
|
|
||||||
)
|
|
||||||
logger.info(
|
|
||||||
"alignment columns: %d clusters, %d primary, %d secondary -> %d significant",
|
|
||||||
len(clusters), len(primary), len(secondary), len(significant),
|
|
||||||
)
|
|
||||||
|
|
||||||
if not significant:
|
|
||||||
# Fallback: single column covering all content
|
|
||||||
x_min = min(w["left"] for w in words)
|
|
||||||
x_max = max(w["left"] + w["width"] for w in words)
|
|
||||||
return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
|
|
||||||
|
|
||||||
# --- Merge nearby clusters ---
|
|
||||||
merge_distance = max(25, int(zone_w * 0.03))
|
|
||||||
merged = [significant[0].copy()]
|
|
||||||
for s in significant[1:]:
|
|
||||||
if s["mean_x"] - merged[-1]["mean_x"] < merge_distance:
|
|
||||||
prev = merged[-1]
|
|
||||||
total = prev["count"] + s["count"]
|
|
||||||
prev["mean_x"] = (
|
|
||||||
prev["mean_x"] * prev["count"] + s["mean_x"] * s["count"]
|
|
||||||
) // total
|
|
||||||
prev["count"] = total
|
|
||||||
prev["min_edge"] = min(prev["min_edge"], s["min_edge"])
|
|
||||||
prev["max_edge"] = max(prev["max_edge"], s["max_edge"])
|
|
||||||
prev["distinct_rows"] = max(prev["distinct_rows"], s["distinct_rows"])
|
|
||||||
else:
|
|
||||||
merged.append(s.copy())
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"alignment columns: %d after merge (distance=%d)",
|
|
||||||
len(merged), merge_distance,
|
|
||||||
)
|
|
||||||
|
|
||||||
# --- Build column boundaries ---
|
|
||||||
margin = max(5, int(zone_w * 0.005))
|
|
||||||
content_x_min = min(w["left"] for w in words)
|
|
||||||
content_x_max = max(w["left"] + w["width"] for w in words)
|
|
||||||
|
|
||||||
columns: List[Dict[str, Any]] = []
|
|
||||||
for i, cluster in enumerate(merged):
|
|
||||||
x_min = max(content_x_min, cluster["min_edge"] - margin)
|
|
||||||
if i + 1 < len(merged):
|
|
||||||
x_max = merged[i + 1]["min_edge"] - margin
|
|
||||||
else:
|
|
||||||
x_max = content_x_max
|
|
||||||
|
|
||||||
columns.append({
|
|
||||||
"index": i,
|
|
||||||
"type": f"column_{i + 1}" if len(merged) > 1 else "column_text",
|
|
||||||
"x_min": x_min,
|
|
||||||
"x_max": x_max,
|
|
||||||
})
|
|
||||||
|
|
||||||
return columns
|
|
||||||
|
|
||||||
|
|
||||||
_MARKER_CHARS = set("*-+#>")
|
|
||||||
|
|
||||||
|
|
||||||
def _merge_inline_marker_columns(
|
|
||||||
columns: List[Dict],
|
|
||||||
words: List[Dict],
|
|
||||||
) -> List[Dict]:
|
|
||||||
"""Merge narrow marker columns (bullets, numbering) into adjacent text.
|
|
||||||
|
|
||||||
Bullet points (*, -) and numbering (1., 2.) create narrow columns
|
|
||||||
at the left edge of a zone. These are inline markers that indent text,
|
|
||||||
not real separate columns. Merge them with their right neighbour.
|
|
||||||
|
|
||||||
Does NOT merge columns containing alphabetic words like "to", "in",
|
|
||||||
"der", "die", "das" — those are legitimate content columns.
|
|
||||||
"""
|
|
||||||
if len(columns) < 2:
|
|
||||||
return columns
|
|
||||||
|
|
||||||
merged: List[Dict] = []
|
|
||||||
skip: set = set()
|
|
||||||
|
|
||||||
for i, col in enumerate(columns):
|
|
||||||
if i in skip:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Find words in this column
|
|
||||||
col_words = [
|
|
||||||
w for w in words
|
|
||||||
if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
|
|
||||||
]
|
|
||||||
col_width = col["x_max"] - col["x_min"]
|
|
||||||
|
|
||||||
# Narrow column with mostly short words -> MIGHT be inline markers
|
|
||||||
if col_words and col_width < 80:
|
|
||||||
avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words)
|
|
||||||
if avg_len <= 2 and i + 1 < len(columns):
|
|
||||||
# Check if words are actual markers (symbols/numbers) vs
|
|
||||||
# real alphabetic words like "to", "in", "der", "die"
|
|
||||||
texts = [(w.get("text") or "").strip() for w in col_words]
|
|
||||||
alpha_count = sum(
|
|
||||||
1 for t in texts
|
|
||||||
if t and t[0].isalpha() and t not in _MARKER_CHARS
|
|
||||||
)
|
|
||||||
alpha_ratio = alpha_count / len(texts) if texts else 0
|
|
||||||
|
|
||||||
# If >=50% of words are alphabetic, this is a real column
|
|
||||||
if alpha_ratio >= 0.5:
|
|
||||||
logger.info(
|
|
||||||
" kept narrow column %d (w=%d, avg_len=%.1f, "
|
|
||||||
"alpha=%.0f%%) -- contains real words",
|
|
||||||
i, col_width, avg_len, alpha_ratio * 100,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Merge into next column
|
|
||||||
next_col = columns[i + 1].copy()
|
|
||||||
next_col["x_min"] = col["x_min"]
|
|
||||||
merged.append(next_col)
|
|
||||||
skip.add(i + 1)
|
|
||||||
logger.info(
|
|
||||||
" merged inline marker column %d (w=%d, avg_len=%.1f) "
|
|
||||||
"into column %d",
|
|
||||||
i, col_width, avg_len, i + 1,
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
merged.append(col)
|
|
||||||
|
|
||||||
# Re-index
|
|
||||||
for i, col in enumerate(merged):
|
|
||||||
col["index"] = i
|
|
||||||
col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text"
|
|
||||||
|
|
||||||
return merged
|
|
||||||
|
|||||||
@@ -1,402 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/filters.py
|
||||||
Grid Editor — word/zone filtering, border ghosts, decorative margins, footers.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Split from grid_editor_helpers.py for maintainability.
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.filters")
|
||||||
All functions are pure computation — no HTTP, DB, or session side effects.
|
|
||||||
|
|
||||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
||||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
|
|
||||||
"""Remove page-border decoration strip words BEFORE column detection.
|
|
||||||
|
|
||||||
Scans from each page edge inward to find the first significant x-gap
|
|
||||||
(>30 px). If the edge cluster contains <15 % of total words, those
|
|
||||||
words are removed as border-strip artifacts (alphabet letters,
|
|
||||||
illustration fragments).
|
|
||||||
|
|
||||||
Must run BEFORE ``_build_zone_grid`` so that column detection only
|
|
||||||
sees real content words and doesn't produce inflated row counts.
|
|
||||||
"""
|
|
||||||
if len(words) < 10:
|
|
||||||
return words, 0
|
|
||||||
|
|
||||||
sorted_words = sorted(words, key=lambda w: w.get("left", 0))
|
|
||||||
total = len(sorted_words)
|
|
||||||
|
|
||||||
# -- Left-edge scan (running max right-edge) --
|
|
||||||
left_count = 0
|
|
||||||
running_right = 0
|
|
||||||
for gi in range(total - 1):
|
|
||||||
running_right = max(
|
|
||||||
running_right,
|
|
||||||
sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
|
|
||||||
)
|
|
||||||
if sorted_words[gi + 1].get("left", 0) - running_right > 30:
|
|
||||||
left_count = gi + 1
|
|
||||||
break
|
|
||||||
|
|
||||||
# -- Right-edge scan (running min left) --
|
|
||||||
right_count = 0
|
|
||||||
running_left = sorted_words[-1].get("left", 0)
|
|
||||||
for gi in range(total - 1, 0, -1):
|
|
||||||
running_left = min(running_left, sorted_words[gi].get("left", 0))
|
|
||||||
prev_right = (
|
|
||||||
sorted_words[gi - 1].get("left", 0)
|
|
||||||
+ sorted_words[gi - 1].get("width", 0)
|
|
||||||
)
|
|
||||||
if running_left - prev_right > 30:
|
|
||||||
right_count = total - gi
|
|
||||||
break
|
|
||||||
|
|
||||||
# Validate candidate strip: real border decorations are mostly short
|
|
||||||
# words (alphabet letters like "A", "Bb", stray marks). Multi-word
|
|
||||||
# content like "der Ranzen" or "die Schals" (continuation of German
|
|
||||||
# translations) must NOT be removed.
|
|
||||||
def _is_decorative_strip(candidates: List[Dict]) -> bool:
|
|
||||||
if not candidates:
|
|
||||||
return False
|
|
||||||
short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
|
|
||||||
return short / len(candidates) >= 0.45
|
|
||||||
|
|
||||||
strip_ids: set = set()
|
|
||||||
if left_count > 0 and left_count / total < 0.20:
|
|
||||||
candidates = sorted_words[:left_count]
|
|
||||||
if _is_decorative_strip(candidates):
|
|
||||||
strip_ids = {id(w) for w in candidates}
|
|
||||||
elif right_count > 0 and right_count / total < 0.20:
|
|
||||||
candidates = sorted_words[total - right_count:]
|
|
||||||
if _is_decorative_strip(candidates):
|
|
||||||
strip_ids = {id(w) for w in candidates}
|
|
||||||
|
|
||||||
if not strip_ids:
|
|
||||||
return words, 0
|
|
||||||
|
|
||||||
return [w for w in words if id(w) not in strip_ids], len(strip_ids)
|
|
||||||
|
|
||||||
|
|
||||||
# Characters that are typically OCR artefacts from box border lines.
|
|
||||||
# Intentionally excludes ! (red markers) and . , ; (real punctuation).
|
|
||||||
_GRID_GHOST_CHARS = set("|1lI[](){}/\\-\u2014\u2013_~=+")
|
|
||||||
|
|
||||||
|
|
||||||
def _filter_border_ghosts(
|
|
||||||
words: List[Dict],
|
|
||||||
boxes: List,
|
|
||||||
) -> tuple:
|
|
||||||
"""Remove words sitting on box borders that are OCR artefacts.
|
|
||||||
|
|
||||||
Returns (filtered_words, removed_count).
|
|
||||||
"""
|
|
||||||
if not boxes or not words:
|
|
||||||
return words, 0
|
|
||||||
|
|
||||||
# Build border bands from detected boxes
|
|
||||||
x_bands: List[tuple] = []
|
|
||||||
y_bands: List[tuple] = []
|
|
||||||
for b in boxes:
|
|
||||||
bt = (
|
|
||||||
b.border_thickness
|
|
||||||
if hasattr(b, "border_thickness")
|
|
||||||
else b.get("border_thickness", 3)
|
|
||||||
)
|
|
||||||
# Skip borderless boxes (images/graphics) -- no border line to produce ghosts
|
|
||||||
if bt == 0:
|
|
||||||
continue
|
|
||||||
bx = b.x if hasattr(b, "x") else b.get("x", 0)
|
|
||||||
by = b.y if hasattr(b, "y") else b.get("y", 0)
|
|
||||||
bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
|
|
||||||
bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
|
|
||||||
margin = max(bt * 2, 10) + 6
|
|
||||||
x_bands.append((bx - margin, bx + margin))
|
|
||||||
x_bands.append((bx + bw - margin, bx + bw + margin))
|
|
||||||
y_bands.append((by - margin, by + margin))
|
|
||||||
y_bands.append((by + bh - margin, by + bh + margin))
|
|
||||||
|
|
||||||
def _is_ghost(w: Dict) -> bool:
|
|
||||||
text = (w.get("text") or "").strip()
|
|
||||||
if not text:
|
|
||||||
return False
|
|
||||||
# Check if any word edge (not just center) touches a border band
|
|
||||||
w_left = w["left"]
|
|
||||||
w_right = w["left"] + w["width"]
|
|
||||||
w_top = w["top"]
|
|
||||||
w_bottom = w["top"] + w["height"]
|
|
||||||
on_border = (
|
|
||||||
any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands)
|
|
||||||
or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands)
|
|
||||||
)
|
|
||||||
if not on_border:
|
|
||||||
return False
|
|
||||||
if len(text) == 1 and text in _GRID_GHOST_CHARS:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
filtered = [w for w in words if not _is_ghost(w)]
|
|
||||||
return filtered, len(words) - len(filtered)
|
|
||||||
|
|
||||||
|
|
||||||
def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
|
|
||||||
"""Extract all word_boxes from cells into a flat list of word dicts."""
|
|
||||||
words: List[Dict] = []
|
|
||||||
for cell in cells:
|
|
||||||
for wb in cell.get("word_boxes") or []:
|
|
||||||
if wb.get("text", "").strip():
|
|
||||||
words.append({
|
|
||||||
"text": wb["text"],
|
|
||||||
"left": wb["left"],
|
|
||||||
"top": wb["top"],
|
|
||||||
"width": wb["width"],
|
|
||||||
"height": wb["height"],
|
|
||||||
"conf": wb.get("conf", 0),
|
|
||||||
})
|
|
||||||
return words
|
|
||||||
|
|
||||||
|
|
||||||
def _words_in_zone(
|
|
||||||
words: List[Dict],
|
|
||||||
zone_y: int,
|
|
||||||
zone_h: int,
|
|
||||||
zone_x: int,
|
|
||||||
zone_w: int,
|
|
||||||
) -> List[Dict]:
|
|
||||||
"""Filter words whose Y-center falls within a zone's bounds."""
|
|
||||||
zone_y_end = zone_y + zone_h
|
|
||||||
zone_x_end = zone_x + zone_w
|
|
||||||
result = []
|
|
||||||
for w in words:
|
|
||||||
cy = w["top"] + w["height"] / 2
|
|
||||||
cx = w["left"] + w["width"] / 2
|
|
||||||
if zone_y <= cy <= zone_y_end and zone_x <= cx <= zone_x_end:
|
|
||||||
result.append(w)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def _get_content_bounds(words: List[Dict]) -> tuple:
|
|
||||||
"""Get content bounds from word positions."""
|
|
||||||
if not words:
|
|
||||||
return 0, 0, 0, 0
|
|
||||||
x_min = min(w["left"] for w in words)
|
|
||||||
y_min = min(w["top"] for w in words)
|
|
||||||
x_max = max(w["left"] + w["width"] for w in words)
|
|
||||||
y_max = max(w["top"] + w["height"] for w in words)
|
|
||||||
return x_min, y_min, x_max - x_min, y_max - y_min
|
|
||||||
|
|
||||||
|
|
||||||
def _filter_decorative_margin(
|
|
||||||
words: List[Dict],
|
|
||||||
img_w: int,
|
|
||||||
log: Any,
|
|
||||||
session_id: str,
|
|
||||||
) -> Dict[str, Any]:
|
|
||||||
"""Remove words that belong to a decorative alphabet strip on a margin.
|
|
||||||
|
|
||||||
Some vocabulary worksheets have a vertical A-Z alphabet graphic along
|
|
||||||
the left or right edge. OCR reads each letter as an isolated single-
|
|
||||||
character word. These decorative elements are not content and confuse
|
|
||||||
column/row detection.
|
|
||||||
|
|
||||||
Detection criteria (phase 1 -- find the strip using single-char words):
|
|
||||||
- Words are in the outer 30% of the page (left or right)
|
|
||||||
- Nearly all words are single characters (letters or digits)
|
|
||||||
- At least 8 such words form a vertical strip (>=8 unique Y positions)
|
|
||||||
- Average horizontal spread of the strip is small (< 80px)
|
|
||||||
|
|
||||||
Phase 2 -- once a strip is confirmed, also remove any short word (<=3
|
|
||||||
chars) in the same narrow x-range. This catches multi-char OCR
|
|
||||||
artifacts like "Vv" that belong to the same decorative element.
|
|
||||||
|
|
||||||
Modifies *words* in place.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dict with 'found' (bool), 'side' (str), 'letters_detected' (int).
|
|
||||||
"""
|
|
||||||
no_strip: Dict[str, Any] = {"found": False, "side": "", "letters_detected": 0}
|
|
||||||
if not words or img_w <= 0:
|
|
||||||
return no_strip
|
|
||||||
|
|
||||||
margin_cutoff = img_w * 0.30
|
|
||||||
# Phase 1: find candidate strips using short words (1-2 chars).
|
|
||||||
# OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb")
|
|
||||||
# rather than singles, so accept <=2-char words as strip candidates.
|
|
||||||
left_strip = [
|
|
||||||
w for w in words
|
|
||||||
if len((w.get("text") or "").strip()) <= 2
|
|
||||||
and w["left"] + w.get("width", 0) / 2 < margin_cutoff
|
|
||||||
]
|
|
||||||
right_strip = [
|
|
||||||
w for w in words
|
|
||||||
if len((w.get("text") or "").strip()) <= 2
|
|
||||||
and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
|
|
||||||
]
|
|
||||||
|
|
||||||
for strip, side in [(left_strip, "left"), (right_strip, "right")]:
|
|
||||||
if len(strip) < 6:
|
|
||||||
continue
|
|
||||||
# Check vertical distribution: should have many distinct Y positions
|
|
||||||
y_centers = sorted(set(
|
|
||||||
int(w["top"] + w.get("height", 0) / 2) // 20 * 20 # bucket
|
|
||||||
for w in strip
|
|
||||||
))
|
|
||||||
if len(y_centers) < 6:
|
|
||||||
continue
|
|
||||||
# Check horizontal compactness
|
|
||||||
x_positions = [w["left"] for w in strip]
|
|
||||||
x_min = min(x_positions)
|
|
||||||
x_max = max(x_positions)
|
|
||||||
x_spread = x_max - x_min
|
|
||||||
if x_spread > 80:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Phase 2: strip confirmed -- also collect short words in same x-range
|
|
||||||
# Expand x-range slightly to catch neighbors (e.g. "Vv" next to "U")
|
|
||||||
strip_x_lo = x_min - 20
|
|
||||||
strip_x_hi = x_max + 60 # word width + tolerance
|
|
||||||
all_strip_words = [
|
|
||||||
w for w in words
|
|
||||||
if len((w.get("text") or "").strip()) <= 3
|
|
||||||
and strip_x_lo <= w["left"] <= strip_x_hi
|
|
||||||
and (w["left"] + w.get("width", 0) / 2 < margin_cutoff
|
|
||||||
if side == "left"
|
|
||||||
else w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff)
|
|
||||||
]
|
|
||||||
|
|
||||||
strip_set = set(id(w) for w in all_strip_words)
|
|
||||||
before = len(words)
|
|
||||||
words[:] = [w for w in words if id(w) not in strip_set]
|
|
||||||
removed = before - len(words)
|
|
||||||
if removed:
|
|
||||||
log.info(
|
|
||||||
"build-grid session %s: removed %d decorative %s-margin words "
|
|
||||||
"(strip x=%d-%d)",
|
|
||||||
session_id, removed, side, strip_x_lo, strip_x_hi,
|
|
||||||
)
|
|
||||||
return {"found": True, "side": side, "letters_detected": len(strip)}
|
|
||||||
|
|
||||||
return no_strip
|
|
||||||
|
|
||||||
|
|
||||||
def _filter_footer_words(
|
|
||||||
words: List[Dict],
|
|
||||||
img_h: int,
|
|
||||||
log: Any,
|
|
||||||
session_id: str,
|
|
||||||
) -> Optional[Dict]:
|
|
||||||
"""Remove isolated words in the bottom 5% of the page (page numbers).
|
|
||||||
|
|
||||||
Modifies *words* in place and returns a page_number metadata dict
|
|
||||||
if a page number was extracted, or None.
|
|
||||||
"""
|
|
||||||
if not words or img_h <= 0:
|
|
||||||
return None
|
|
||||||
footer_y = img_h * 0.95
|
|
||||||
footer_words = [
|
|
||||||
w for w in words
|
|
||||||
if w["top"] + w.get("height", 0) / 2 > footer_y
|
|
||||||
]
|
|
||||||
if not footer_words:
|
|
||||||
return None
|
|
||||||
# Only remove if footer has very few words (<= 3) with short text
|
|
||||||
total_text = "".join((w.get("text") or "").strip() for w in footer_words)
|
|
||||||
if len(footer_words) <= 3 and len(total_text) <= 10:
|
|
||||||
# Extract page number metadata before removing
|
|
||||||
page_number_info = {
|
|
||||||
"text": total_text.strip(),
|
|
||||||
"y_pct": round(footer_words[0]["top"] / img_h * 100, 1),
|
|
||||||
}
|
|
||||||
# Try to parse as integer
|
|
||||||
digits = "".join(c for c in total_text if c.isdigit())
|
|
||||||
if digits:
|
|
||||||
page_number_info["number"] = int(digits)
|
|
||||||
|
|
||||||
footer_set = set(id(w) for w in footer_words)
|
|
||||||
words[:] = [w for w in words if id(w) not in footer_set]
|
|
||||||
log.info(
|
|
||||||
"build-grid session %s: extracted page number '%s' and removed %d footer words",
|
|
||||||
session_id, total_text, len(footer_words),
|
|
||||||
)
|
|
||||||
return page_number_info
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _filter_header_junk(
|
|
||||||
words: List[Dict],
|
|
||||||
img_h: int,
|
|
||||||
log: Any,
|
|
||||||
session_id: str,
|
|
||||||
) -> None:
|
|
||||||
"""Remove OCR junk from header illustrations above the real content.
|
|
||||||
|
|
||||||
Textbook pages often have decorative header graphics (illustrations,
|
|
||||||
icons) that OCR reads as low-confidence junk characters. Real content
|
|
||||||
typically starts further down the page.
|
|
||||||
|
|
||||||
Algorithm:
|
|
||||||
1. Find the "content start" -- the first Y position where a dense
|
|
||||||
horizontal row of 3+ high-confidence words begins.
|
|
||||||
2. Above that line, remove words with conf < 75 and text <= 3 chars.
|
|
||||||
These are almost certainly OCR artifacts from illustrations.
|
|
||||||
|
|
||||||
Modifies *words* in place.
|
|
||||||
"""
|
|
||||||
if not words or img_h <= 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
# --- Find content start: first horizontal row with >=3 high-conf words ---
|
|
||||||
# Sort words by Y
|
|
||||||
sorted_by_y = sorted(words, key=lambda w: w["top"])
|
|
||||||
content_start_y = 0
|
|
||||||
_ROW_TOLERANCE = img_h * 0.02 # words within 2% of page height = same row
|
|
||||||
_MIN_ROW_WORDS = 3
|
|
||||||
_MIN_CONF = 80
|
|
||||||
|
|
||||||
i = 0
|
|
||||||
while i < len(sorted_by_y):
|
|
||||||
row_y = sorted_by_y[i]["top"]
|
|
||||||
# Collect words in this row band
|
|
||||||
row_words = []
|
|
||||||
j = i
|
|
||||||
while j < len(sorted_by_y) and sorted_by_y[j]["top"] - row_y < _ROW_TOLERANCE:
|
|
||||||
row_words.append(sorted_by_y[j])
|
|
||||||
j += 1
|
|
||||||
# Count high-confidence words with real text (> 1 char)
|
|
||||||
high_conf = [
|
|
||||||
w for w in row_words
|
|
||||||
if w.get("conf", 0) >= _MIN_CONF
|
|
||||||
and len((w.get("text") or "").strip()) > 1
|
|
||||||
]
|
|
||||||
if len(high_conf) >= _MIN_ROW_WORDS:
|
|
||||||
content_start_y = row_y
|
|
||||||
break
|
|
||||||
i = j if j > i else i + 1
|
|
||||||
|
|
||||||
if content_start_y <= 0:
|
|
||||||
return # no clear content start found
|
|
||||||
|
|
||||||
# --- Remove low-conf short junk above content start ---
|
|
||||||
junk = [
|
|
||||||
w for w in words
|
|
||||||
if w["top"] + w.get("height", 0) < content_start_y
|
|
||||||
and w.get("conf", 0) < 75
|
|
||||||
and len((w.get("text") or "").strip()) <= 3
|
|
||||||
]
|
|
||||||
if not junk:
|
|
||||||
return
|
|
||||||
|
|
||||||
junk_set = set(id(w) for w in junk)
|
|
||||||
before = len(words)
|
|
||||||
words[:] = [w for w in words if id(w) not in junk_set]
|
|
||||||
removed = before - len(words)
|
|
||||||
if removed:
|
|
||||||
log.info(
|
|
||||||
"build-grid session %s: removed %d header junk words above y=%d "
|
|
||||||
"(content start)",
|
|
||||||
session_id, removed, content_start_y,
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -1,499 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/headers.py
|
||||||
Grid Editor — header/heading detection and colspan (merged cell) detection.
|
import importlib as _importlib
|
||||||
Split from grid_editor_helpers.py. Pure computation, no HTTP/DB side effects.
|
import sys as _sys
|
||||||
Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.headers")
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from cv_ocr_engines import _text_has_garbled_ipa
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int:
|
|
||||||
"""Detect heading rows by color + height after color annotation.
|
|
||||||
|
|
||||||
A row is a heading if:
|
|
||||||
1. ALL word_boxes have color_name != 'black' (typically 'blue')
|
|
||||||
2. Mean word height > 1.2x median height of all words in the zone
|
|
||||||
|
|
||||||
Detected heading rows are merged into a single spanning cell.
|
|
||||||
Returns count of headings detected.
|
|
||||||
"""
|
|
||||||
heading_count = 0
|
|
||||||
|
|
||||||
for z in zones_data:
|
|
||||||
cells = z.get("cells", [])
|
|
||||||
rows = z.get("rows", [])
|
|
||||||
columns = z.get("columns", [])
|
|
||||||
if not cells or not rows or len(columns) < 2:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Compute median word height across the zone
|
|
||||||
all_heights = []
|
|
||||||
for cell in cells:
|
|
||||||
for wb in cell.get("word_boxes") or []:
|
|
||||||
h = wb.get("height", 0)
|
|
||||||
if h > 0:
|
|
||||||
all_heights.append(h)
|
|
||||||
if not all_heights:
|
|
||||||
continue
|
|
||||||
all_heights_sorted = sorted(all_heights)
|
|
||||||
median_h = all_heights_sorted[len(all_heights_sorted) // 2]
|
|
||||||
|
|
||||||
heading_row_indices = []
|
|
||||||
for row in rows:
|
|
||||||
if row.get("is_header"):
|
|
||||||
continue # already detected as header
|
|
||||||
ri = row["index"]
|
|
||||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
|
||||||
row_wbs = [
|
|
||||||
wb for cell in row_cells
|
|
||||||
for wb in cell.get("word_boxes") or []
|
|
||||||
]
|
|
||||||
if not row_wbs:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Condition 1: ALL words are non-black
|
|
||||||
all_colored = all(
|
|
||||||
wb.get("color_name", "black") != "black"
|
|
||||||
for wb in row_wbs
|
|
||||||
)
|
|
||||||
if not all_colored:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Condition 2: mean height > 1.2x median
|
|
||||||
mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs)
|
|
||||||
if mean_h <= median_h * 1.2:
|
|
||||||
continue
|
|
||||||
|
|
||||||
heading_row_indices.append(ri)
|
|
||||||
|
|
||||||
# Merge heading cells into spanning cells
|
|
||||||
for hri in heading_row_indices:
|
|
||||||
header_cells = [c for c in cells if c.get("row_index") == hri]
|
|
||||||
if len(header_cells) <= 1:
|
|
||||||
# Single cell -- just mark it as heading
|
|
||||||
if header_cells:
|
|
||||||
header_cells[0]["col_type"] = "heading"
|
|
||||||
heading_count += 1
|
|
||||||
# Mark row as header
|
|
||||||
for row in rows:
|
|
||||||
if row["index"] == hri:
|
|
||||||
row["is_header"] = True
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Collect all word_boxes and text from all columns
|
|
||||||
all_wb = []
|
|
||||||
all_text_parts = []
|
|
||||||
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
|
||||||
all_wb.extend(hc.get("word_boxes", []))
|
|
||||||
if hc.get("text", "").strip():
|
|
||||||
all_text_parts.append(hc["text"].strip())
|
|
||||||
|
|
||||||
# Remove all cells for this row, replace with one spanning cell
|
|
||||||
z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
|
|
||||||
|
|
||||||
if all_wb:
|
|
||||||
x_min = min(wb["left"] for wb in all_wb)
|
|
||||||
y_min = min(wb["top"] for wb in all_wb)
|
|
||||||
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
|
||||||
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
|
||||||
|
|
||||||
# Use the actual starting col_index from the first cell
|
|
||||||
first_col = min(hc["col_index"] for hc in header_cells)
|
|
||||||
zone_idx = z.get("zone_index", 0)
|
|
||||||
z["cells"].append({
|
|
||||||
"cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
|
|
||||||
"zone_index": zone_idx,
|
|
||||||
"row_index": hri,
|
|
||||||
"col_index": first_col,
|
|
||||||
"col_type": "heading",
|
|
||||||
"text": " ".join(all_text_parts),
|
|
||||||
"confidence": 0.0,
|
|
||||||
"bbox_px": {"x": x_min, "y": y_min,
|
|
||||||
"w": x_max - x_min, "h": y_max - y_min},
|
|
||||||
"bbox_pct": {
|
|
||||||
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
||||||
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
|
||||||
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
|
||||||
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
|
||||||
},
|
|
||||||
"word_boxes": all_wb,
|
|
||||||
"ocr_engine": "words_first",
|
|
||||||
"is_bold": True,
|
|
||||||
})
|
|
||||||
|
|
||||||
# Mark row as header
|
|
||||||
for row in rows:
|
|
||||||
if row["index"] == hri:
|
|
||||||
row["is_header"] = True
|
|
||||||
heading_count += 1
|
|
||||||
|
|
||||||
return heading_count
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_heading_rows_by_single_cell(
|
|
||||||
zones_data: List[Dict], img_w: int, img_h: int,
|
|
||||||
) -> int:
|
|
||||||
"""Detect heading rows that have only a single content cell.
|
|
||||||
|
|
||||||
Black headings like "Theme" have normal color and height, so they are
|
|
||||||
missed by ``_detect_heading_rows_by_color``. The distinguishing signal
|
|
||||||
is that they occupy only one column while normal vocabulary rows fill
|
|
||||||
at least 2-3 columns.
|
|
||||||
|
|
||||||
A row qualifies as a heading if:
|
|
||||||
1. It is not already marked as a header/heading.
|
|
||||||
2. It has exactly ONE cell whose col_type starts with ``column_``
|
|
||||||
(excluding column_1 / page_ref which only carries page numbers).
|
|
||||||
3. That single cell is NOT in the last column (continuation/example
|
|
||||||
lines like "2. Ver\u00e4nderung, Wechsel" often sit alone in column_4).
|
|
||||||
4. The text does not start with ``[`` (IPA continuation).
|
|
||||||
5. The zone has >=3 columns and >=5 rows (avoids false positives in
|
|
||||||
tiny zones).
|
|
||||||
6. The majority of rows in the zone have >=2 content cells (ensures
|
|
||||||
we are in a multi-column vocab layout).
|
|
||||||
"""
|
|
||||||
heading_count = 0
|
|
||||||
|
|
||||||
for z in zones_data:
|
|
||||||
cells = z.get("cells", [])
|
|
||||||
rows = z.get("rows", [])
|
|
||||||
columns = z.get("columns", [])
|
|
||||||
if len(columns) < 3 or len(rows) < 5:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Determine the last col_index (example/sentence column)
|
|
||||||
col_indices = sorted(set(c.get("col_index", 0) for c in cells))
|
|
||||||
if not col_indices:
|
|
||||||
continue
|
|
||||||
last_col = col_indices[-1]
|
|
||||||
|
|
||||||
# Count content cells per row (column_* but not column_1/page_ref).
|
|
||||||
# Exception: column_1 cells that contain a dictionary article word
|
|
||||||
# (die/der/das etc.) ARE content -- they appear in dictionary layouts
|
|
||||||
# where the leftmost column holds grammatical articles.
|
|
||||||
_ARTICLE_WORDS = {
|
|
||||||
"die", "der", "das", "dem", "den", "des", "ein", "eine",
|
|
||||||
"the", "a", "an",
|
|
||||||
}
|
|
||||||
row_content_counts: Dict[int, int] = {}
|
|
||||||
for cell in cells:
|
|
||||||
ct = cell.get("col_type", "")
|
|
||||||
if not ct.startswith("column_"):
|
|
||||||
continue
|
|
||||||
if ct == "column_1":
|
|
||||||
ctext = (cell.get("text") or "").strip().lower()
|
|
||||||
if ctext not in _ARTICLE_WORDS:
|
|
||||||
continue
|
|
||||||
ri = cell.get("row_index", -1)
|
|
||||||
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
|
|
||||||
|
|
||||||
# Majority of rows must have >=2 content cells
|
|
||||||
multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
|
|
||||||
if multi_col_rows < len(rows) * 0.4:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Exclude first and last non-header rows -- these are typically
|
|
||||||
# page numbers or footer text, not headings.
|
|
||||||
non_header_rows = [r for r in rows if not r.get("is_header")]
|
|
||||||
if len(non_header_rows) < 3:
|
|
||||||
continue
|
|
||||||
first_ri = non_header_rows[0]["index"]
|
|
||||||
last_ri = non_header_rows[-1]["index"]
|
|
||||||
|
|
||||||
heading_row_indices = []
|
|
||||||
for row in rows:
|
|
||||||
if row.get("is_header"):
|
|
||||||
continue
|
|
||||||
ri = row["index"]
|
|
||||||
if ri == first_ri or ri == last_ri:
|
|
||||||
continue
|
|
||||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
|
||||||
content_cells = [
|
|
||||||
c for c in row_cells
|
|
||||||
if c.get("col_type", "").startswith("column_")
|
|
||||||
and (c.get("col_type") != "column_1"
|
|
||||||
or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
|
|
||||||
]
|
|
||||||
if len(content_cells) != 1:
|
|
||||||
continue
|
|
||||||
cell = content_cells[0]
|
|
||||||
# Not in the last column (continuation/example lines)
|
|
||||||
if cell.get("col_index") == last_col:
|
|
||||||
continue
|
|
||||||
text = (cell.get("text") or "").strip()
|
|
||||||
if not text or text.startswith("["):
|
|
||||||
continue
|
|
||||||
# Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)"
|
|
||||||
if text.startswith("("):
|
|
||||||
continue
|
|
||||||
# Single cell NOT in the first content column is likely a
|
|
||||||
# continuation/overflow line, not a heading. Real headings
|
|
||||||
# ("Theme 1", "Unit 3: ...") appear in the first or second
|
|
||||||
# content column.
|
|
||||||
first_content_col = col_indices[0] if col_indices else 0
|
|
||||||
if cell.get("col_index", 0) > first_content_col + 1:
|
|
||||||
continue
|
|
||||||
# Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz")
|
|
||||||
# but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]")
|
|
||||||
_REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b")
|
|
||||||
if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
|
|
||||||
continue
|
|
||||||
# Guard: dictionary section headings are short (1-4 alpha chars
|
|
||||||
# like "A", "Ab", "Zi", "Sch"). Longer text that starts
|
|
||||||
# lowercase is a regular vocabulary word (e.g. "zentral") that
|
|
||||||
# happens to appear alone in its row.
|
|
||||||
alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text)
|
|
||||||
if len(alpha_only) > 4 and text[0].islower():
|
|
||||||
continue
|
|
||||||
heading_row_indices.append(ri)
|
|
||||||
|
|
||||||
# Guard: if >25% of eligible rows would become headings, the
|
|
||||||
# heuristic is misfiring (e.g. sparse single-column layout where
|
|
||||||
# most rows naturally have only 1 content cell).
|
|
||||||
eligible_rows = len(non_header_rows) - 2 # minus first/last excluded
|
|
||||||
if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
|
|
||||||
logger.debug(
|
|
||||||
"Skipping single-cell heading detection for zone %s: "
|
|
||||||
"%d/%d rows would be headings (>25%%)",
|
|
||||||
z.get("zone_index"), len(heading_row_indices), eligible_rows,
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
for hri in heading_row_indices:
|
|
||||||
header_cells = [c for c in cells if c.get("row_index") == hri]
|
|
||||||
if not header_cells:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Collect all word_boxes and text
|
|
||||||
all_wb = []
|
|
||||||
all_text_parts = []
|
|
||||||
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
|
||||||
all_wb.extend(hc.get("word_boxes", []))
|
|
||||||
if hc.get("text", "").strip():
|
|
||||||
all_text_parts.append(hc["text"].strip())
|
|
||||||
|
|
||||||
first_col_idx = min(hc["col_index"] for hc in header_cells)
|
|
||||||
|
|
||||||
# Remove old cells for this row, add spanning heading cell
|
|
||||||
z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
|
|
||||||
|
|
||||||
if all_wb:
|
|
||||||
x_min = min(wb["left"] for wb in all_wb)
|
|
||||||
y_min = min(wb["top"] for wb in all_wb)
|
|
||||||
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
|
||||||
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
|
||||||
else:
|
|
||||||
# Fallback to first cell bbox
|
|
||||||
bp = header_cells[0].get("bbox_px", {})
|
|
||||||
x_min = bp.get("x", 0)
|
|
||||||
y_min = bp.get("y", 0)
|
|
||||||
x_max = x_min + bp.get("w", 0)
|
|
||||||
y_max = y_min + bp.get("h", 0)
|
|
||||||
|
|
||||||
zone_idx = z.get("zone_index", 0)
|
|
||||||
z["cells"].append({
|
|
||||||
"cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
|
|
||||||
"zone_index": zone_idx,
|
|
||||||
"row_index": hri,
|
|
||||||
"col_index": first_col_idx,
|
|
||||||
"col_type": "heading",
|
|
||||||
"text": " ".join(all_text_parts),
|
|
||||||
"confidence": 0.0,
|
|
||||||
"bbox_px": {"x": x_min, "y": y_min,
|
|
||||||
"w": x_max - x_min, "h": y_max - y_min},
|
|
||||||
"bbox_pct": {
|
|
||||||
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
||||||
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
|
||||||
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
|
||||||
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
|
||||||
},
|
|
||||||
"word_boxes": all_wb,
|
|
||||||
"ocr_engine": "words_first",
|
|
||||||
"is_bold": False,
|
|
||||||
})
|
|
||||||
|
|
||||||
for row in rows:
|
|
||||||
if row["index"] == hri:
|
|
||||||
row["is_header"] = True
|
|
||||||
heading_count += 1
|
|
||||||
|
|
||||||
return heading_count
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_header_rows(
|
|
||||||
rows: List[Dict],
|
|
||||||
zone_words: List[Dict],
|
|
||||||
zone_y: int,
|
|
||||||
columns: Optional[List[Dict]] = None,
|
|
||||||
skip_first_row_header: bool = False,
|
|
||||||
) -> List[int]:
|
|
||||||
"""Detect header rows: first-row heuristic + spanning header detection.
|
|
||||||
|
|
||||||
A "spanning header" is a row whose words stretch across multiple column
|
|
||||||
boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns).
|
|
||||||
"""
|
|
||||||
if len(rows) < 2:
|
|
||||||
return []
|
|
||||||
|
|
||||||
headers = []
|
|
||||||
|
|
||||||
if not skip_first_row_header:
|
|
||||||
first_row = rows[0]
|
|
||||||
second_row = rows[1]
|
|
||||||
|
|
||||||
# Gap between first and second row > 0.5x average row height
|
|
||||||
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
|
||||||
gap = second_row["y_min"] - first_row["y_max"]
|
|
||||||
if gap > avg_h * 0.5:
|
|
||||||
headers.append(0)
|
|
||||||
|
|
||||||
# Also check if first row words are taller than average (bold/header text)
|
|
||||||
all_heights = [w["height"] for w in zone_words]
|
|
||||||
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
|
||||||
first_row_words = [
|
|
||||||
w for w in zone_words
|
|
||||||
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
|
||||||
]
|
|
||||||
if first_row_words:
|
|
||||||
first_h = max(w["height"] for w in first_row_words)
|
|
||||||
if first_h > median_h * 1.3:
|
|
||||||
if 0 not in headers:
|
|
||||||
headers.append(0)
|
|
||||||
|
|
||||||
# Note: Spanning-header detection (rows spanning all columns) has been
|
|
||||||
# disabled because it produces too many false positives on vocabulary
|
|
||||||
# worksheets where IPA transcriptions or short entries naturally span
|
|
||||||
# multiple columns with few words. The first-row heuristic above is
|
|
||||||
# sufficient for detecting real headers.
|
|
||||||
|
|
||||||
return headers
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_colspan_cells(
|
|
||||||
zone_words: List[Dict],
|
|
||||||
columns: List[Dict],
|
|
||||||
rows: List[Dict],
|
|
||||||
cells: List[Dict],
|
|
||||||
img_w: int,
|
|
||||||
img_h: int,
|
|
||||||
) -> List[Dict]:
|
|
||||||
"""Detect and merge cells that span multiple columns (colspan).
|
|
||||||
|
|
||||||
A word-block (PaddleOCR phrase) that extends significantly past a column
|
|
||||||
boundary into the next column indicates a merged cell. This replaces
|
|
||||||
the incorrectly split cells with a single cell spanning multiple columns.
|
|
||||||
|
|
||||||
Works for both full-page scans and box zones.
|
|
||||||
"""
|
|
||||||
if len(columns) < 2 or not zone_words or not rows:
|
|
||||||
return cells
|
|
||||||
|
|
||||||
from cv_words_first import _assign_word_to_row
|
|
||||||
|
|
||||||
# Column boundaries (midpoints between adjacent columns)
|
|
||||||
col_boundaries = []
|
|
||||||
for ci in range(len(columns) - 1):
|
|
||||||
col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2)
|
|
||||||
|
|
||||||
def _cols_covered(w_left: float, w_right: float) -> List[int]:
|
|
||||||
"""Return list of column indices that a word-block covers."""
|
|
||||||
covered = []
|
|
||||||
for col in columns:
|
|
||||||
col_mid = (col["x_min"] + col["x_max"]) / 2
|
|
||||||
# Word covers a column if it extends past the column's midpoint
|
|
||||||
if w_left < col_mid < w_right:
|
|
||||||
covered.append(col["index"])
|
|
||||||
# Also include column if word starts within it
|
|
||||||
elif col["x_min"] <= w_left < col["x_max"]:
|
|
||||||
covered.append(col["index"])
|
|
||||||
return sorted(set(covered))
|
|
||||||
|
|
||||||
# Group original word-blocks by row
|
|
||||||
row_word_blocks: Dict[int, List[Dict]] = {}
|
|
||||||
for w in zone_words:
|
|
||||||
ri = _assign_word_to_row(w, rows)
|
|
||||||
row_word_blocks.setdefault(ri, []).append(w)
|
|
||||||
|
|
||||||
# For each row, check if any word-block spans multiple columns
|
|
||||||
rows_to_merge: Dict[int, List[Dict]] = {} # row_index -> list of spanning word-blocks
|
|
||||||
|
|
||||||
for ri, wblocks in row_word_blocks.items():
|
|
||||||
spanning = []
|
|
||||||
for w in wblocks:
|
|
||||||
w_left = w["left"]
|
|
||||||
w_right = w_left + w["width"]
|
|
||||||
covered = _cols_covered(w_left, w_right)
|
|
||||||
if len(covered) >= 2:
|
|
||||||
spanning.append({"word": w, "cols": covered})
|
|
||||||
if spanning:
|
|
||||||
rows_to_merge[ri] = spanning
|
|
||||||
|
|
||||||
if not rows_to_merge:
|
|
||||||
return cells
|
|
||||||
|
|
||||||
# Merge cells for spanning rows
|
|
||||||
new_cells = []
|
|
||||||
for cell in cells:
|
|
||||||
ri = cell.get("row_index", -1)
|
|
||||||
if ri not in rows_to_merge:
|
|
||||||
new_cells.append(cell)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check if this cell's column is part of a spanning block
|
|
||||||
ci = cell.get("col_index", -1)
|
|
||||||
is_part_of_span = False
|
|
||||||
for span in rows_to_merge[ri]:
|
|
||||||
if ci in span["cols"]:
|
|
||||||
is_part_of_span = True
|
|
||||||
# Only emit the merged cell for the FIRST column in the span
|
|
||||||
if ci == span["cols"][0]:
|
|
||||||
# Use the ORIGINAL word-block text (not the split cell texts
|
|
||||||
# which may have broken words like "euros a" + "nd cents")
|
|
||||||
orig_word = span["word"]
|
|
||||||
merged_text = orig_word.get("text", "").strip()
|
|
||||||
all_wb = [orig_word]
|
|
||||||
|
|
||||||
# Compute merged bbox
|
|
||||||
if all_wb:
|
|
||||||
x_min = min(wb["left"] for wb in all_wb)
|
|
||||||
y_min = min(wb["top"] for wb in all_wb)
|
|
||||||
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
|
||||||
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
|
||||||
else:
|
|
||||||
x_min = y_min = x_max = y_max = 0
|
|
||||||
|
|
||||||
new_cells.append({
|
|
||||||
"cell_id": cell["cell_id"],
|
|
||||||
"row_index": ri,
|
|
||||||
"col_index": span["cols"][0],
|
|
||||||
"col_type": "spanning_header",
|
|
||||||
"colspan": len(span["cols"]),
|
|
||||||
"text": merged_text,
|
|
||||||
"confidence": cell.get("confidence", 0),
|
|
||||||
"bbox_px": {"x": x_min, "y": y_min,
|
|
||||||
"w": x_max - x_min, "h": y_max - y_min},
|
|
||||||
"bbox_pct": {
|
|
||||||
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
||||||
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
|
||||||
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
|
||||||
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
|
||||||
},
|
|
||||||
"word_boxes": all_wb,
|
|
||||||
"ocr_engine": cell.get("ocr_engine", ""),
|
|
||||||
"is_bold": cell.get("is_bold", False),
|
|
||||||
})
|
|
||||||
logger.info(
|
|
||||||
"colspan detected: row %d, cols %s -> merged %d cells (%r)",
|
|
||||||
ri, span["cols"], len(span["cols"]), merged_text[:50],
|
|
||||||
)
|
|
||||||
break
|
|
||||||
if not is_part_of_span:
|
|
||||||
new_cells.append(cell)
|
|
||||||
|
|
||||||
return new_cells
|
|
||||||
|
|||||||
@@ -1,58 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/helpers.py
|
||||||
Grid Editor helper functions — barrel re-export module.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
This file re-exports all public symbols from the split sub-modules
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.helpers")
|
||||||
so that existing ``from grid_editor_helpers import ...`` statements
|
|
||||||
continue to work without changes.
|
|
||||||
|
|
||||||
Sub-modules:
|
|
||||||
- grid_editor_columns — column detection, cross-column splitting, marker merging
|
|
||||||
- grid_editor_filters — word/zone filtering, border ghosts, decorative margins
|
|
||||||
- grid_editor_headers — header/heading detection, colspan detection
|
|
||||||
- grid_editor_zones — vertical dividers, zone splitting/merging, zone grid building
|
|
||||||
|
|
||||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
||||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# --- Re-export: columns ---------------------------------------------------
|
|
||||||
from grid_editor_columns import ( # noqa: F401
|
|
||||||
_is_recognized_word,
|
|
||||||
_split_cross_column_words,
|
|
||||||
_cluster_columns_by_alignment,
|
|
||||||
_MARKER_CHARS,
|
|
||||||
_merge_inline_marker_columns,
|
|
||||||
)
|
|
||||||
|
|
||||||
# --- Re-export: filters ----------------------------------------------------
|
|
||||||
from grid_editor_filters import ( # noqa: F401
|
|
||||||
_filter_border_strip_words,
|
|
||||||
_GRID_GHOST_CHARS,
|
|
||||||
_filter_border_ghosts,
|
|
||||||
_flatten_word_boxes,
|
|
||||||
_words_in_zone,
|
|
||||||
_get_content_bounds,
|
|
||||||
_filter_decorative_margin,
|
|
||||||
_filter_footer_words,
|
|
||||||
_filter_header_junk,
|
|
||||||
)
|
|
||||||
|
|
||||||
# --- Re-export: headers ----------------------------------------------------
|
|
||||||
from grid_editor_headers import ( # noqa: F401
|
|
||||||
_detect_heading_rows_by_color,
|
|
||||||
_detect_heading_rows_by_single_cell,
|
|
||||||
_detect_header_rows,
|
|
||||||
_detect_colspan_cells,
|
|
||||||
)
|
|
||||||
|
|
||||||
# --- Re-export: zones -------------------------------------------------------
|
|
||||||
from grid_editor_zones import ( # noqa: F401
|
|
||||||
_PIPE_RE_VSPLIT,
|
|
||||||
_detect_vertical_dividers,
|
|
||||||
_split_zone_at_vertical_dividers,
|
|
||||||
_merge_content_zones_across_boxes,
|
|
||||||
_build_zone_grid,
|
|
||||||
)
|
|
||||||
|
|
||||||
# --- Re-export from cv_words_first (used by cv_box_layout.py) ---------------
|
|
||||||
from cv_words_first import _cluster_rows # noqa: F401
|
|
||||||
|
|||||||
@@ -1,389 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to grid/editor/zones.py
|
||||||
Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Split from grid_editor_helpers.py for maintainability.
|
_sys.modules[__name__] = _importlib.import_module("grid.editor.zones")
|
||||||
All functions are pure computation — no HTTP, DB, or session side effects.
|
|
||||||
|
|
||||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
||||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
from cv_vocab_types import PageZone
|
|
||||||
from cv_words_first import _cluster_rows, _build_cells
|
|
||||||
|
|
||||||
from grid_editor_columns import (
|
|
||||||
_cluster_columns_by_alignment,
|
|
||||||
_merge_inline_marker_columns,
|
|
||||||
_split_cross_column_words,
|
|
||||||
)
|
|
||||||
from grid_editor_headers import (
|
|
||||||
_detect_header_rows,
|
|
||||||
_detect_colspan_cells,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Vertical divider detection and zone splitting
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# Matches a token that consists solely of one or more pipe characters.
_PIPE_RE_VSPLIT = re.compile(r"^\|+$")


def _detect_vertical_dividers(
    words: List[Dict],
    zone_x: int,
    zone_w: int,
    zone_y: int,
    zone_h: int,
) -> List[float]:
    """Find vertical divider lines formed by pipe-only word boxes.

    Word boxes whose stripped text is nothing but ``|`` characters are
    grouped by the x-position of their horizontal center.  A group is
    reported as a divider when it has at least five members, its mean
    center lies between 15% and 85% of the zone width, and its members
    span at least half of the zone height.

    Args:
        words: Word boxes with ``left``, ``top``, ``width``, ``height``
            and (optionally) ``text`` keys.
        zone_x: Left edge of the zone in pixels.
        zone_w: Zone width in pixels.
        zone_y: Top edge of the zone (accepted but not read here).
        zone_h: Zone height in pixels.

    Returns:
        Ascending list of divider x-positions; empty when none qualify.
    """
    if not words or zone_w <= 0 or zone_h <= 0:
        return []

    def _center_x(box: Dict) -> float:
        return box["left"] + box["width"] / 2

    pipe_boxes = [
        box for box in words
        if _PIPE_RE_VSPLIT.match((box.get("text") or "").strip())
    ]
    if len(pipe_boxes) < 5:
        return []

    tolerance = max(15, int(zone_w * 0.02))

    # Chain the sorted centers: a center joins the current group while it
    # is within `tolerance` of the group's most recent member.
    groups: List[List[float]] = []
    for center in sorted(_center_x(box) for box in pipe_boxes):
        if groups and center - groups[-1][-1] <= tolerance:
            groups[-1].append(center)
        else:
            groups.append([center])

    found: List[float] = []
    for group in groups:
        if len(group) < 5:
            continue

        mean_x = sum(group) / len(group)

        # Dividers hugging the zone border are rejected.
        rel_pos = (mean_x - zone_x) / zone_w
        if rel_pos < 0.15 or rel_pos > 0.85:
            continue

        # Vertical coverage is measured over every pipe near the mean,
        # not only over the members that were chained into this group.
        members = [
            box for box in pipe_boxes
            if abs(_center_x(box) - mean_x) <= tolerance
        ]
        edges = (
            [box["top"] for box in members]
            + [box["top"] + box["height"] for box in members]
        )
        y_span = max(edges) - min(edges) if edges else 0
        if y_span < zone_h * 0.5:
            continue

        found.append(mean_x)

    return sorted(found)
|
|
||||||
|
|
||||||
|
|
||||||
def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
) -> List["PageZone"]:
    """Cut a PageZone into side-by-side sub-zones at the given x-positions.

    Args:
        zone: The zone to split; its y/height, type, box and overlays are
            copied onto every sub-zone.
        divider_xs: Divider x-positions used as inner boundaries.
        vsplit_group_id: Value stored in ``vsplit_group`` on each sub-zone.

    Returns:
        One PageZone per gap between consecutive boundaries, left to right.
        Every sub-zone is created with ``index=0``.
    """
    edges = [zone.x] + divider_xs + [zone.x + zone.width]
    last_slot = len(edges) - 2

    def _hint_for(slot: int) -> str:
        # The first slot always wins, even when it is also the last one.
        if slot == 0:
            return "left_of_vsplit"
        if slot == last_slot:
            return "right_of_vsplit"
        return "middle_of_vsplit"

    pieces = []
    for slot, (lo, hi) in enumerate(zip(edges, edges[1:])):
        left_px = int(lo)
        right_px = int(hi)
        pieces.append(
            PageZone(
                index=0,  # re-indexed later
                zone_type=zone.zone_type,
                y=zone.y,
                height=zone.height,
                x=left_px,
                width=right_px - left_px,
                box=zone.box,
                # The same overlays object is handed to every sub-zone.
                image_overlays=zone.image_overlays,
                layout_hint=_hint_for(slot),
                vsplit_group=vsplit_group_id,
            )
        )
    return pieces
|
|
||||||
|
|
||||||
|
|
||||||
def _merge_content_zones_across_boxes(
    zones: List,
    content_x: int,
    content_w: int,
) -> List:
    """Fuse content zones that are only separated by box zones.

    A run shaped ``content, (box, content)+`` collapses into one content
    zone spanning the vertical extent of all its content members; the box
    zones in between are recorded as ``image_overlays`` on the merged zone.
    A box is only absorbed when a content zone directly follows it, so
    boxes that are not between two content zones stay standalone.

    Args:
        zones: Zones in page order, each exposing ``zone_type``, ``x``,
            ``y``, ``width``, ``height`` and ``box``.
        content_x: x-position assigned to every merged zone.
        content_w: Width assigned to every merged zone.

    Returns:
        The input list itself when it has fewer than three zones;
        otherwise a new list whose members have been re-indexed in place.
    """
    if len(zones) < 3:
        return zones

    def _overlay_for(box_zone) -> Dict:
        entry = {
            "y": box_zone.y,
            "height": box_zone.height,
            "x": box_zone.x,
            "width": box_zone.width,
        }
        if box_zone.box:
            frame = box_zone.box
            entry["box"] = {
                "x": frame.x,
                "y": frame.y,
                "width": frame.width,
                "height": frame.height,
                "confidence": frame.confidence,
                "border_thickness": frame.border_thickness,
            }
        return entry

    total = len(zones)
    merged_zones: List = []
    pos = 0
    while pos < total:
        current = zones[pos]
        if current.zone_type != "content":
            merged_zones.append(current)
            pos += 1
            continue

        # Greedily absorb (box, content) pairs that follow this content zone.
        contents = [current]
        boxes = []
        nxt = pos + 1
        while (
            nxt + 1 < total
            and zones[nxt].zone_type == "box"
            and zones[nxt + 1].zone_type == "content"
        ):
            boxes.append(zones[nxt])
            contents.append(zones[nxt + 1])
            nxt += 2

        if not boxes:
            # Nothing to merge with -- keep the content zone untouched.
            merged_zones.append(current)
            pos += 1
            continue

        top = min(c.y for c in contents)
        bottom = max(c.y + c.height for c in contents)
        merged_zones.append(
            PageZone(
                index=0,  # re-indexed below
                zone_type="content",
                y=top,
                height=bottom - top,
                x=content_x,
                width=content_w,
                image_overlays=[_overlay_for(bz) for bz in boxes],
            )
        )
        pos = nxt

    for new_index, member in enumerate(merged_zones):
        member.index = new_index

    logger.info(
        "zone-merge: %d zones -> %d zones after merging across boxes",
        len(zones), len(merged_zones),
    )
    return merged_zones
|
|
||||||
|
|
||||||
|
|
||||||
def _build_zone_grid(
    zone_words: List[Dict],
    zone_x: int,
    zone_y: int,
    zone_w: int,
    zone_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
) -> Dict[str, Any]:
    """Assemble the column / row / cell grid of one zone from its words.

    Pipeline: cluster rows, pick columns (shared or detected), split words
    that straddle column boundaries, build cells, detect colspans on the
    pre-split words, prefix cell IDs with the zone index, detect header
    rows, collapse multi-cell header rows into one spanning cell, and
    finally convert columns and rows to the output format.

    Args:
        zone_words: Word boxes belonging to the zone.
        zone_x: Accepted for signature parity; not read in this function.
        zone_y: Zone top edge, forwarded to header detection.
        zone_w: Zone width, forwarded to column detection.
        zone_h: Accepted for signature parity; not read in this function.
        zone_index: Index used to prefix cell IDs and tag cells.
        img_w: Image width used for percentage conversion.
        img_h: Image height used for percentage conversion.
        global_columns: When truthy, these pre-computed columns are used
            as-is and per-zone detection plus marker merging is skipped.
        skip_first_row_header: Forwarded to header-row detection.

    Returns:
        Dict with ``columns``, ``rows``, ``cells`` and ``header_rows``.
        A fully built grid additionally carries ``_raw_columns``; the
        empty result returned for missing words/columns/rows does not.
    """

    def _empty_grid() -> Dict[str, Any]:
        return {"columns": [], "rows": [], "cells": [], "header_rows": []}

    def _pct(value, total):
        # Percentage with two decimals; 0 when the reference size is falsy.
        return round(value / total * 100, 2) if total else 0

    if not zone_words:
        return _empty_grid()

    # Rows come first because column alignment analysis needs them.
    rows = _cluster_rows(zone_words)

    # Verbose diagnostics for small/medium zones only.
    if len(zone_words) <= 60:
        import statistics as _st
        positive_heights = [wd['height'] for wd in zone_words if wd.get('height', 0) > 0]
        median_height = _st.median(positive_heights) if positive_heights else 20
        # Computed for the log line only; not fed back into clustering.
        y_tolerance = max(median_height * 0.5, 5)
        logger.info(
            "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
            zone_index, len(zone_words), median_height, y_tolerance, len(rows),
        )
        for wd in sorted(zone_words, key=lambda item: (item['top'], item['left'])):
            logger.info(
                " zone %d word: y=%d x=%d h=%d w=%d '%s'",
                zone_index, wd['top'], wd['left'], wd['height'], wd['width'],
                wd.get('text', '')[:40],
            )
        for row_info in rows:
            logger.info(
                " zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
                zone_index, row_info['index'], row_info['y_min'],
                row_info['y_max'], row_info['y_center'],
            )

    if global_columns:
        columns = global_columns
    else:
        # Detect per zone, then fold inline marker columns (bullets,
        # numbering) into the adjacent text column.
        columns = _merge_inline_marker_columns(
            _cluster_columns_by_alignment(zone_words, zone_w, rows),
            zone_words,
        )

    if not columns or not rows:
        return _empty_grid()

    multi_column = len(columns) >= 2

    # Colspan detection needs the words as they were before splitting,
    # because splitting cuts spanning words at column boundaries.
    unsplit_words = zone_words
    if multi_column:
        zone_words = _split_cross_column_words(zone_words, columns)

    cells = _build_cells(zone_words, columns, rows, img_w, img_h)

    if multi_column:
        cells = _detect_colspan_cells(unsplit_words, columns, rows, cells, img_w, img_h)

    for cell in cells:
        cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
        cell["zone_index"] = zone_index

    header_rows = _detect_header_rows(
        rows, zone_words, zone_y, columns,
        skip_first_row_header=skip_first_row_header,
    )

    # Collapse each multi-cell header row into a single column-0 cell.
    if header_rows and multi_column:
        for header_idx in header_rows:
            row_cells = [c for c in cells if c["row_index"] == header_idx]
            if len(row_cells) <= 1:
                continue

            gathered_boxes = []
            gathered_text = []
            for hc in sorted(row_cells, key=lambda c: c["col_index"]):
                gathered_boxes.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    gathered_text.append(hc["text"].strip())

            # The row's cells are dropped before the replacement is built.
            cells = [c for c in cells if c["row_index"] != header_idx]
            if not gathered_boxes:
                continue

            left = min(wb["left"] for wb in gathered_boxes)
            top = min(wb["top"] for wb in gathered_boxes)
            right = max(wb["left"] + wb["width"] for wb in gathered_boxes)
            bottom = max(wb["top"] + wb["height"] for wb in gathered_boxes)
            cells.append({
                "cell_id": f"R{header_idx:02d}_C0",
                "row_index": header_idx,
                "col_index": 0,
                "col_type": "spanning_header",
                "text": " ".join(gathered_text),
                "confidence": 0.0,
                "bbox_px": {"x": left, "y": top,
                            "w": right - left, "h": bottom - top},
                "bbox_pct": {
                    "x": _pct(left, img_w),
                    "y": _pct(top, img_h),
                    "w": _pct(right - left, img_w),
                    "h": _pct(bottom - top, img_h),
                },
                "word_boxes": gathered_boxes,
                "ocr_engine": "words_first",
                "is_bold": True,
            })

    out_columns = [
        {
            "index": col["index"],
            "label": col["type"],
            "x_min_px": round(col["x_min"]),
            "x_max_px": round(col["x_max"]),
            "x_min_pct": _pct(col["x_min"], img_w),
            "x_max_pct": _pct(col["x_max"], img_w),
            "bold": False,
        }
        for col in columns
    ]

    out_rows = [
        {
            "index": row["index"],
            "y_min_px": round(row["y_min"]),
            "y_max_px": round(row["y_max"]),
            "y_min_pct": _pct(row["y_min"], img_h),
            "y_max_pct": _pct(row["y_max"], img_h),
            "is_header": row["index"] in header_rows,
        }
        for row in rows
    ]

    return {
        "columns": out_columns,
        "rows": out_rows,
        "cells": cells,
        "header_rows": header_rows,
        "_raw_columns": columns,  # internal: for propagation to other zones
    }
|
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
"""
|
||||||
|
Vocab package — restructured from vocab_* flat modules.
|
||||||
|
|
||||||
|
Backward-compatible re-exports: consumers can still use
|
||||||
|
``from vocab_worksheet_api import ...`` etc. via the shim files in backend/.
|
||||||
|
"""
|
||||||
@@ -0,0 +1,196 @@
|
|||||||
|
"""
|
||||||
|
Vocab Learn Bridge — Converts vocabulary session data into Learning Units.
|
||||||
|
|
||||||
|
Bridges klausur-service (vocab extraction) with backend-lehrer (learning units + generators).
|
||||||
|
Creates a Learning Unit in backend-lehrer, then triggers MC/Cloze/QA generation.
|
||||||
|
|
||||||
|
DATENSCHUTZ: All communication stays within Docker network (breakpilot-network).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import httpx
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
BACKEND_LEHRER_URL = os.getenv("BACKEND_LEHRER_URL", "http://backend-lehrer:8001")
|
||||||
|
|
||||||
|
|
||||||
|
def vocab_to_analysis_data(session_name: str, vocabulary: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Convert vocab-session entries into the analysis_data format consumed
    by the backend-lehrer generators (MC, Cloze, QA).

    Entries whose English and German sides are both empty are skipped in
    the text representations, but the original ``vocabulary`` list is
    passed through unchanged.

    Args:
        session_name: Used as the ``title`` of the analysis data.
        vocabulary: Entries with ``english``, ``german`` and optionally
            ``example_sentence``. Missing keys and explicit ``None``
            values are both treated as empty strings.

    Returns:
        Dict with:
        - title: Display name
        - subject: Subject area
        - grade_level: Target grade
        - canonical_text: One "en = de (example)" line per entry
        - printed_blocks: One {"text": ...} block per entry
        - vocabulary: Original vocab data (for vocab-specific modules)
    """

    def _text(entry: Dict[str, Any], key: str) -> str:
        # `.get(key, "")` only covers a missing key; entries can also
        # carry an explicit None (optional fields are stored without a
        # default), which would crash on `.strip()`.
        return (entry.get(key) or "").strip()

    canonical_lines = []
    printed_blocks = []

    for v in vocabulary:
        en = _text(v, "english")
        de = _text(v, "german")
        example = _text(v, "example_sentence")

        if not en and not de:
            continue

        line = f"{en} = {de}"
        block_text = f"{en} — {de}"
        if example:
            line += f" ({example})"
            block_text += f" | {example}"

        canonical_lines.append(line)
        printed_blocks.append({"text": block_text})

    return {
        "title": session_name,
        "subject": "English Vocabulary",
        "grade_level": "5-8",
        "canonical_text": "\n".join(canonical_lines),
        "printed_blocks": printed_blocks,
        "vocabulary": vocabulary,
    }
|
||||||
|
|
||||||
|
|
||||||
|
async def create_learning_unit(
    session_name: str,
    vocabulary: List[Dict[str, Any]],
    grade: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Create a Learning Unit in backend-lehrer from vocabulary data.

    Steps:
    1. Create the unit via POST /api/learning-units/
    2. Write the analysis data to ``<unit_id>_analyse.json`` under
       ``~/Arbeitsblaetter/Lerneinheiten`` for the generators

    Args:
        session_name: Title of the new unit.
        vocabulary: Vocabulary entries of the session.
        grade: Grade label sent to the backend; defaults to "5-8".

    Returns:
        Dict with unit_id, unit, analysis_path, vocabulary_count, status.

    Raises:
        ValueError: If ``vocabulary`` is empty.
        RuntimeError: If the backend request fails, the response is not a
            JSON object, or the response carries no ``id``.
    """
    if not vocabulary:
        raise ValueError("No vocabulary entries provided")

    analysis_data = vocab_to_analysis_data(session_name, vocabulary)

    create_payload = {
        "title": session_name,
        "subject": "Englisch",
        "grade": grade or "5-8",
    }

    # 1. Create Learning Unit
    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            resp = await client.post(
                f"{BACKEND_LEHRER_URL}/api/learning-units/",
                json=create_payload,
            )
            resp.raise_for_status()
            unit = resp.json()
        except httpx.HTTPError as e:
            logger.error("Failed to create learning unit: %s", e)
            # Chain the cause so the original transport error stays visible.
            raise RuntimeError(f"Backend-Lehrer nicht erreichbar: {e}") from e
        except ValueError as e:
            # A body that is not valid JSON must not surface as ValueError:
            # callers would confuse it with the empty-vocabulary case above.
            logger.error("Learning unit response is not valid JSON: %s", e)
            raise RuntimeError(f"Learning Unit response is not valid JSON: {e}") from e

    unit_id = unit.get("id") if isinstance(unit, dict) else None
    if not unit_id:
        raise RuntimeError("Learning Unit created but no ID returned")

    logger.info(
        "Created learning unit %s with %d vocabulary entries",
        unit_id, len(vocabulary),
    )

    # 2. Save analysis_data as JSON file for generators
    analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
    os.makedirs(analysis_dir, exist_ok=True)
    analysis_path = os.path.join(analysis_dir, f"{unit_id}_analyse.json")

    with open(analysis_path, "w", encoding="utf-8") as f:
        json.dump(analysis_data, f, ensure_ascii=False, indent=2)

    logger.info("Saved analysis data to %s", analysis_path)

    return {
        "unit_id": unit_id,
        "unit": unit,
        "analysis_path": analysis_path,
        "vocabulary_count": len(vocabulary),
        "status": "created",
    }
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_learning_modules(
    unit_id: str,
    analysis_path: str,
) -> Dict[str, Any]:
    """
    Trigger QA, MC and Cloze generation for a unit via the backend-lehrer API.

    The three generators are requested one after another (QA, MC, Cloze).
    Each request is best-effort: a failure is recorded in the result for
    that module and does not stop the remaining requests.

    Args:
        unit_id: ID of the learning unit to generate modules for.
        analysis_path: Path of the JSON file holding the analysis data.

    Returns:
        Dict with ``unit_id`` plus one entry per module (``mc``, ``cloze``,
        ``qa``). Each entry has a ``status`` of "generated" (with ``data``),
        "skipped" (non-200 response, with ``reason``) or "error"
        (exception, with ``reason``).

    Raises:
        OSError: If ``analysis_path`` cannot be opened.
        ValueError: If the file does not contain valid JSON.
    """
    results = {
        "unit_id": unit_id,
        "mc": {"status": "pending"},
        "cloze": {"status": "pending"},
        "qa": {"status": "pending"},
    }

    # Load analysis data
    with open(analysis_path, "r", encoding="utf-8") as f:
        analysis_data = json.load(f)

    vocab_count = len(analysis_data.get("vocabulary") or [])

    # (result key, log label, endpoint suffix, request payload)
    jobs = [
        # QA output includes Leitner fields.
        ("qa", "QA", "generate-qa",
         {"analysis_data": analysis_data, "num_questions": min(vocab_count, 20)}),
        ("mc", "MC", "generate-mc",
         {"analysis_data": analysis_data, "num_questions": min(vocab_count, 10)}),
        ("cloze", "Cloze", "generate-cloze",
         {"analysis_data": analysis_data}),
    ]

    async with httpx.AsyncClient(timeout=120.0) as client:
        for key, label, endpoint, payload in jobs:
            try:
                resp = await client.post(
                    f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/{endpoint}",
                    json=payload,
                )
                if resp.status_code == 200:
                    results[key] = {"status": "generated", "data": resp.json()}
                else:
                    # Logged for every module so a skipped generator is
                    # never silent.
                    logger.warning("%s generation returned %s", label, resp.status_code)
                    results[key] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"}
            except Exception as e:
                # Deliberately broad: one failing generator must not
                # prevent the others from running.
                logger.warning("%s generation failed: %s", label, e)
                results[key] = {"status": "error", "reason": str(e)}

    return results
|
||||||
@@ -0,0 +1,427 @@
|
|||||||
|
"""
|
||||||
|
Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions.
|
||||||
|
|
||||||
|
Replaces in-memory storage with database persistence.
|
||||||
|
See migrations/001_vocab_sessions.sql for schema.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
from typing import Optional, List, Dict, Any
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Database configuration
|
||||||
|
DATABASE_URL = os.getenv(
|
||||||
|
"DATABASE_URL",
|
||||||
|
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Connection pool (initialized lazily)
|
||||||
|
_pool: Optional[asyncpg.Pool] = None
|
||||||
|
|
||||||
|
|
||||||
|
async def get_pool() -> asyncpg.Pool:
    """Return the module-wide connection pool, creating it on first use.

    The pool is built from ``DATABASE_URL`` with 2 to 10 connections and
    cached in the module-level ``_pool`` variable.
    """
    global _pool
    if _pool is not None:
        return _pool
    _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool
|
||||||
|
|
||||||
|
|
||||||
|
async def init_vocab_tables():
    """
    Create the vocab tables from the SQL migration if they do not exist.

    Intended to be called at startup. Existence is decided by looking up
    ``vocab_sessions`` in ``information_schema.tables``. When the table is
    missing, ``migrations/001_vocab_sessions.sql`` is executed; if the
    migration file cannot be found, a warning is logged and nothing is
    created.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Check if tables exist
        tables_exist = await conn.fetchval("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = 'vocab_sessions'
            )
        """)

        if tables_exist:
            logger.debug("Vocab tables already exist")
            return

        logger.info("Creating vocab tables...")

        # Look next to this module first (original behavior), then one
        # directory up.
        # NOTE(review): this module now lives inside a package directory;
        # it is assumed the migrations/ folder may have stayed at the
        # former top level -- confirm where it actually lives.
        module_dir = os.path.dirname(__file__)
        candidates = [
            os.path.join(module_dir, "migrations/001_vocab_sessions.sql"),
            os.path.join(os.path.dirname(os.path.abspath(module_dir)),
                         "migrations/001_vocab_sessions.sql"),
        ]
        migration_path = next(
            (path for path in candidates if os.path.exists(path)),
            None,
        )

        if migration_path is None:
            logger.warning(
                "Migration file not found: %s", " | ".join(candidates),
            )
            return

        # Explicit encoding so the SQL is read identically on every host.
        with open(migration_path, "r", encoding="utf-8") as f:
            sql = f.read()
        await conn.execute(sql)
        logger.info("Vocab tables created successfully")
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# SESSION OPERATIONS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
async def create_session_db(
    session_id: str,
    name: str,
    description: str = "",
    source_language: str = "en",
    target_language: str = "de"
) -> Dict[str, Any]:
    """Insert a new vocabulary session and return the stored row as a dict.

    The session starts with status 'pending' and a vocabulary count of 0.
    ``session_id`` must be a valid UUID string.
    """
    insert_sql = """
        INSERT INTO vocab_sessions (
            id, name, description, source_language, target_language,
            status, vocabulary_count
        ) VALUES ($1, $2, $3, $4, $5, 'pending', 0)
        RETURNING *
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        inserted = await conn.fetchrow(
            insert_sql,
            uuid.UUID(session_id),
            name,
            description,
            source_language,
            target_language,
        )
        return _row_to_dict(inserted)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one session by its UUID string; return None when no row matches."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        found = await conn.fetchrow("""
            SELECT * FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))
        return _row_to_dict(found) if found else None
|
||||||
|
|
||||||
|
|
||||||
|
async def list_sessions_db(
    limit: int = 50,
    offset: int = 0,
    status: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Return sessions ordered newest first, optionally filtered by status.

    Args:
        limit: Maximum number of sessions to return.
        offset: Number of sessions to skip.
        status: When truthy, only sessions with this status are returned.
    """
    if status:
        query = """
            SELECT * FROM vocab_sessions
            WHERE status = $1
            ORDER BY created_at DESC
            LIMIT $2 OFFSET $3
        """
        args = (status, limit, offset)
    else:
        query = """
            SELECT * FROM vocab_sessions
            ORDER BY created_at DESC
            LIMIT $1 OFFSET $2
        """
        args = (limit, offset)

    pool = await get_pool()
    async with pool.acquire() as conn:
        fetched = await conn.fetch(query, *args)
        return [_row_to_dict(record) for record in fetched]
|
||||||
|
|
||||||
|
|
||||||
|
async def update_session_db(
    session_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """Update whitelisted columns of a session and return the new row.

    Keyword arguments that are not whitelisted are ignored. When nothing
    remains to update, the current session is returned unchanged. Values
    for the JSON columns are serialized with ``json.dumps``.

    Returns:
        The updated session as a dict, or None when no row matched.
    """
    pool = await get_pool()

    json_columns = ('ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages')
    updatable = (
        'name', 'description', 'status', 'vocabulary_count',
        'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count',
    ) + json_columns

    assignments = []
    params = []
    for column, value in kwargs.items():
        if column not in updatable:
            continue
        if column in json_columns:
            # NOTE: any falsy value (None, [], {}, 0) is stored as NULL.
            value = json.dumps(value) if value else None
        params.append(value)
        # Column names come from the whitelist above, so interpolating
        # them is safe; values always travel as bind parameters.
        assignments.append(f"{column} = ${len(params)}")

    if not assignments:
        return await get_session_db(session_id)

    # The session id is the last bind parameter.
    params.append(uuid.UUID(session_id))
    id_placeholder = len(params)

    async with pool.acquire() as conn:
        updated = await conn.fetchrow(f"""
            UPDATE vocab_sessions
            SET {', '.join(assignments)}
            WHERE id = ${id_placeholder}
            RETURNING *
        """, *params)

        return _row_to_dict(updated) if updated else None
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_session_db(session_id: str) -> bool:
    """Delete a session (related data cascades); True if one row was removed."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        status_tag = await conn.execute("""
            DELETE FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))
        # The command status tag reports how many rows were deleted.
        deleted_exactly_one = status_tag == "DELETE 1"
        return deleted_exactly_one
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# VOCABULARY OPERATIONS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
async def add_vocabulary_db(
    session_id: str,
    vocab_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Insert vocabulary entries for a session and refresh its entry count.

    Args:
        session_id: Session UUID in string form.
        vocab_list: Entries to insert. ``english`` and ``german`` default to
            an empty string; ``example_sentence``, ``example_sentence_gap``,
            ``word_type`` and ``source_page`` default to ``None``.

    Returns:
        The inserted rows as dicts, in input order. An empty or missing
        ``vocab_list`` returns ``[]`` without touching the database.

    Raises:
        ValueError: If ``session_id`` is not a valid UUID string.
    """
    if not vocab_list:
        return []

    pool = await get_pool()
    results = []

    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)

        # Inserts and the counter refresh commit (or roll back) together, so
        # a failure half-way through the list cannot leave orphaned entries
        # next to a stale vocabulary_count.
        async with conn.transaction():
            for vocab in vocab_list:
                row = await conn.fetchrow("""
                    INSERT INTO vocab_entries (
                        id, session_id, english, german, example_sentence,
                        example_sentence_gap, word_type, source_page
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                    RETURNING *
                """,
                    uuid.uuid4(),
                    session_uuid,
                    vocab.get('english', ''),
                    vocab.get('german', ''),
                    vocab.get('example_sentence'),
                    vocab.get('example_sentence_gap'),
                    vocab.get('word_type'),
                    vocab.get('source_page')
                )
                results.append(_row_to_dict(row))

            # Update vocabulary count
            await conn.execute("""
                UPDATE vocab_sessions
                SET vocabulary_count = (
                    SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
                )
                WHERE id = $1
            """, session_uuid)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
async def get_vocabulary_db(
    session_id: str,
    source_page: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Fetch the vocabulary entries of a session as dicts.

    With ``source_page`` given, only that page's entries are returned,
    ordered by creation time. Without it, all entries are returned ordered
    by page (rows without a page last) and then by creation time.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        params = [uuid.UUID(session_id)]
        if source_page is None:
            query = """
                SELECT * FROM vocab_entries
                WHERE session_id = $1
                ORDER BY source_page NULLS LAST, created_at
            """
        else:
            query = """
                SELECT * FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
                ORDER BY created_at
            """
            params.append(source_page)
        rows = await conn.fetch(query, *params)

        return list(map(_row_to_dict, rows))
|
||||||
|
|
||||||
|
|
||||||
|
async def update_vocabulary_db(
    entry_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """Update whitelisted columns of one vocabulary entry.

    Keys in ``kwargs`` that are not whitelisted are ignored.

    Returns:
        The updated row as a dict, or ``None`` when no whitelisted key was
        supplied or no row matches ``entry_id``.
    """
    pool = await get_pool()

    # Only these names may be interpolated into the SQL text.
    allowed_fields = [
        'english', 'german', 'example_sentence', 'example_sentence_gap',
        'word_type', 'source_page'
    ]

    accepted = [(column, new_value) for column, new_value in kwargs.items()
                if column in allowed_fields]
    if not accepted:
        return None

    assignments = [
        f"{column} = ${position}"
        for position, (column, _) in enumerate(accepted, start=1)
    ]
    values = [new_value for _, new_value in accepted]
    values.append(uuid.UUID(entry_id))
    id_position = len(values)  # the entry id is the last bound parameter

    async with pool.acquire() as conn:
        row = await conn.fetchrow(f"""
            UPDATE vocab_entries
            SET {', '.join(assignments)}
            WHERE id = ${id_position}
            RETURNING *
        """, *values)

        return _row_to_dict(row) if row else None
|
||||||
|
|
||||||
|
|
||||||
|
async def clear_page_vocabulary_db(session_id: str, page: int) -> int:
    """Delete every vocabulary entry of one page and refresh the session count.

    Returns:
        The number of deleted rows, parsed from the last token of the DELETE
        status tag (0 when the tag is empty).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        delete_status = await conn.execute(
            """
            DELETE FROM vocab_entries
            WHERE session_id = $1 AND source_page = $2
            """,
            uuid.UUID(session_id),
            page,
        )

        # Keep the denormalised counter on the session in sync.
        await conn.execute(
            """
            UPDATE vocab_sessions
            SET vocabulary_count = (
                SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
            )
            WHERE id = $1
            """,
            uuid.UUID(session_id),
        )

        if not delete_status:
            return 0
        return int(delete_status.split()[-1])
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# WORKSHEET OPERATIONS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
async def create_worksheet_db(
    session_id: str,
    worksheet_types: List[str],
    pdf_path: Optional[str] = None,
    solution_path: Optional[str] = None
) -> Dict[str, Any]:
    """Insert a row into ``vocab_worksheets`` and return it as a dict.

    A fresh UUID is generated for the worksheet; ``worksheet_types`` is
    stored as a JSON-encoded string.
    """
    pool = await get_pool()
    new_worksheet_id = uuid.uuid4()

    insert_sql = """
        INSERT INTO vocab_worksheets (
            id, session_id, worksheet_types, pdf_path, solution_path
        ) VALUES ($1, $2, $3, $4, $5)
        RETURNING *
    """

    async with pool.acquire() as conn:
        inserted = await conn.fetchrow(
            insert_sql,
            new_worksheet_id,
            uuid.UUID(session_id),
            json.dumps(worksheet_types),
            pdf_path,
            solution_path,
        )

        return _row_to_dict(inserted)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]:
    """Look up one worksheet row by its UUID string.

    Returns:
        The row as a dict, or ``None`` when no worksheet has that id.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        found = await conn.fetchrow(
            """
            SELECT * FROM vocab_worksheets WHERE id = $1
            """,
            uuid.UUID(worksheet_id),
        )

        if not found:
            return None
        return _row_to_dict(found)
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_worksheets_for_session_db(session_id: str) -> int:
    """Delete all worksheet rows that belong to a session.

    Returns:
        The number of deleted rows, parsed from the last token of the DELETE
        status tag (0 when the tag is empty).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        delete_status = await conn.execute(
            """
            DELETE FROM vocab_worksheets WHERE session_id = $1
            """,
            uuid.UUID(session_id),
        )

        if not delete_status:
            return 0
        return int(delete_status.split()[-1])
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# PDF CACHE OPERATIONS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Process-local holding area for generated PDF bytes, keyed by worksheet id.
# Nothing in this section evicts entries; they stay until
# clear_cached_pdf_data() is called for the same id.
_pdf_cache: Dict[str, bytes] = {}


def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None:
    """Store ``pdf_data`` under ``worksheet_id``, replacing any earlier entry."""
    _pdf_cache.update({worksheet_id: pdf_data})


def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]:
    """Return the cached bytes for ``worksheet_id``, or ``None`` if absent."""
    try:
        return _pdf_cache[worksheet_id]
    except KeyError:
        return None


def clear_cached_pdf_data(worksheet_id: str) -> None:
    """Drop the cache entry for ``worksheet_id``; unknown ids are ignored."""
    if worksheet_id in _pdf_cache:
        del _pdf_cache[worksheet_id]
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
|
||||||
|
"""Convert asyncpg Record to dict with proper type handling."""
|
||||||
|
if row is None:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
result = dict(row)
|
||||||
|
|
||||||
|
# Convert UUIDs to strings
|
||||||
|
for key in ['id', 'session_id']:
|
||||||
|
if key in result and result[key] is not None:
|
||||||
|
result[key] = str(result[key])
|
||||||
|
|
||||||
|
# Convert datetimes to ISO strings
|
||||||
|
for key in ['created_at', 'updated_at', 'generated_at']:
|
||||||
|
if key in result and result[key] is not None:
|
||||||
|
result[key] = result[key].isoformat()
|
||||||
|
|
||||||
|
# Parse JSONB fields back to dicts/lists
|
||||||
|
for key in ['ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages', 'worksheet_types']:
|
||||||
|
if key in result and result[key] is not None:
|
||||||
|
if isinstance(result[key], str):
|
||||||
|
result[key] = json.loads(result[key])
|
||||||
|
|
||||||
|
return result
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
"""
|
||||||
|
Vocab worksheet sub-package.
|
||||||
|
|
||||||
|
Main entry point: ``from vocab.worksheet.api import router``
|
||||||
|
"""
|
||||||
@@ -0,0 +1,472 @@
|
|||||||
|
"""
|
||||||
|
Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
|
||||||
|
extract-with-boxes, deskewed images, and learning unit generation.
|
||||||
|
|
||||||
|
The two large handlers (compare_ocr_methods, analyze_grid) live in
|
||||||
|
compare_api.py (same package) and are included via compare_router.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Body, HTTPException
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
from datetime import datetime
|
||||||
|
import os
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
def _get_sessions():
    """Return the ``_sessions`` mapping defined in the sibling ``api`` module.

    The import happens at call time instead of module import time --
    presumably because ``api`` pulls in this module's router, which would
    make a top-level import circular.
    """
    from .api import _sessions as session_registry
    return session_registry
|
||||||
|
|
||||||
|
def _get_local_storage_path():
    """Return ``LOCAL_STORAGE_PATH`` from the sibling ``api`` module.

    Resolved lazily at call time, like ``_get_sessions``.
    """
    from .api import LOCAL_STORAGE_PATH as storage_root
    return storage_root
|
||||||
|
from .generation import convert_pdf_page_to_image
|
||||||
|
|
||||||
|
# Try to import Tesseract extractor
|
||||||
|
try:
|
||||||
|
from tesseract_vocab_extractor import (
|
||||||
|
extract_bounding_boxes, TESSERACT_AVAILABLE,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
TESSERACT_AVAILABLE = False
|
||||||
|
|
||||||
|
# Try to import Grid Detection Service
|
||||||
|
try:
|
||||||
|
from services.grid_detection_service import GridDetectionService
|
||||||
|
GRID_SERVICE_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
GRID_SERVICE_AVAILABLE = False
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
analysis_router = APIRouter()
|
||||||
|
|
||||||
|
def _ocr_export_dir():
    """Return the ``ocr-exports`` directory below the local storage root."""
    storage_root = _get_local_storage_path()
    return os.path.join(storage_root, "ocr-exports")
|
||||||
|
|
||||||
|
def _ground_truth_dir():
    """Return the ``ground-truth`` directory below the local storage root."""
    storage_root = _get_local_storage_path()
    return os.path.join(storage_root, "ground-truth")
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# OCR Export Endpoints (for cross-app OCR data sharing)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Persist an OCR export as JSON so other apps can read it back.

    The payload is written to ``<export dir>/<session_id>_page<page_number>.json``
    and ``latest.json`` in the same directory is rewritten to point at it.
    Per the original note, admin-v2 and studio-v2 both reach this service
    through /klausur-api/, which makes this endpoint their shared storage.
    """
    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")

    export_dir = _ocr_export_dir()
    os.makedirs(export_dir, exist_ok=True)

    # The export itself
    export_path = os.path.join(export_dir, f"{session_id}_page{page_number}.json")
    with open(export_path, 'w', encoding='utf-8') as export_file:
        json.dump(data, export_file, ensure_ascii=False, indent=2)

    # Pointer to the most recent export (timestamp is naive UTC)
    pointer = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.utcnow().isoformat(),
    }
    latest_path = os.path.join(export_dir, "latest.json")
    with open(latest_path, 'w', encoding='utf-8') as pointer_file:
        json.dump(pointer, pointer_file, ensure_ascii=False, indent=2)

    return {
        "success": True,
        "session_id": session_id,
        "page_number": page_number,
        "message": "OCR export saved successfully",
    }
|
||||||
|
|
||||||
|
|
||||||
|
@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Return the stored OCR export for one session page.

    Raises:
        HTTPException: 404 when no export file exists for that page.
    """
    file_name = f"{session_id}_page{page_number}.json"
    export_path = os.path.join(_ocr_export_dir(), file_name)

    if os.path.exists(export_path):
        with open(export_path, 'r', encoding='utf-8') as export_file:
            return json.load(export_file)

    raise HTTPException(status_code=404, detail="OCR export not found")
|
||||||
|
|
||||||
|
|
||||||
|
@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Return the export that ``latest.json`` currently points at.

    Raises:
        HTTPException: 404 when there is no pointer file, or when the export
            file it names is missing.
    """
    export_dir = _ocr_export_dir()

    latest_path = os.path.join(export_dir, "latest.json")
    if not os.path.exists(latest_path):
        raise HTTPException(status_code=404, detail="No OCR exports found")

    with open(latest_path, 'r', encoding='utf-8') as pointer_file:
        pointer = json.load(pointer_file)

    # Missing keys yield None and simply produce a file name that won't exist.
    session_id = pointer.get("session_id")
    page_number = pointer.get("page_number")
    export_path = os.path.join(export_dir, f"{session_id}_page{page_number}.json")

    if not os.path.exists(export_path):
        raise HTTPException(status_code=404, detail="Latest OCR export file not found")

    with open(export_path, 'r', encoding='utf-8') as export_file:
        return json.load(export_file)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Extract with Boxes & Deskewed Image
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Run Tesseract + grid detection and return one entry per non-empty row.

    Returns:
        ``{"entries": [...], "image_width": w, "image_height": h}``. Each
        entry carries ``row_index``, ``english``, ``german``, ``example``,
        ``confidence`` (row average scaled by 100, one decimal), the row
        ``bbox`` and the per-column ``bbox_en``/``bbox_de``/``bbox_ex``.
        Per the original documentation, bbox coordinates are percentages
        (0-100).

    Raises:
        HTTPException: 500 when Tesseract or GridDetectionService could not
            be imported.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Step 1: word-level boxes from Tesseract
    tess_result = await extract_bounding_boxes(image_bytes, lang=lang)
    words = tess_result.get("words", [])
    img_w = tess_result.get("image_width", 0)
    img_h = tess_result.get("image_height", 0)

    def _payload(found):
        # Every exit path reports the image size alongside the entries.
        return {"entries": found, "image_width": img_w, "image_height": img_h}

    if not words or img_w == 0 or img_h == 0:
        return _payload([])

    # Step 2: words -> OCR regions
    service = GridDetectionService()
    regions = service.convert_tesseract_regions(words, img_w, img_h)
    if not regions:
        return _payload([])

    # Step 3: grid detection
    grid_result = service.detect_grid(regions)
    if not grid_result.cells:
        return _payload([])

    # Step 4: fold each grid row into one entry
    from services.grid_detection_service import ColumnType

    slot_by_column = (
        (ColumnType.ENGLISH, "en"),
        (ColumnType.GERMAN, "de"),
        (ColumnType.EXAMPLE, "ex"),
    )

    entries = []
    for row_idx, row_cells in enumerate(grid_result.cells):
        texts = {"en": "", "de": "", "ex": ""}
        boxes = {"en": None, "de": None, "ex": None}
        conf_total = 0.0
        conf_count = 0

        for cell in row_cells:
            cell_bbox = {"x": round(cell.x, 2), "y": round(cell.y, 2),
                         "w": round(cell.width, 2), "h": round(cell.height, 2)}
            stripped = cell.text.strip()

            # A later cell of the same column type overwrites an earlier one.
            for column_type, slot in slot_by_column:
                if cell.column_type == column_type:
                    texts[slot] = stripped
                    boxes[slot] = cell_bbox
                    break

            # Confidence counts every non-blank cell, whatever its column.
            if stripped:
                conf_total += cell.confidence
                conf_count += 1

        # Rows without any text in the three known columns are dropped.
        if not (texts["en"] or texts["de"] or texts["ex"]):
            continue

        # Row box = union of the column boxes that exist.
        present = [box for box in (boxes["en"], boxes["de"], boxes["ex"]) if box is not None]
        if present:
            left = min(box["x"] for box in present)
            top = min(box["y"] for box in present)
            right = max(box["x"] + box["w"] for box in present)
            bottom = max(box["y"] + box["h"] for box in present)
            row_bbox = {"x": round(left, 2), "y": round(top, 2),
                        "w": round(right - left, 2), "h": round(bottom - top, 2)}
        else:
            row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}

        avg_conf = round((conf_total / conf_count * 100) if conf_count > 0 else 0, 1)

        entries.append({
            "row_index": row_idx,
            "english": texts["en"],
            "german": texts["de"],
            "example": texts["ex"],
            "confidence": avg_conf,
            "bbox": row_bbox,
            "bbox_en": boxes["en"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": boxes["de"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": boxes["ex"] or {"x": 0, "y": 0, "w": 0, "h": 0},
        })

    return _payload(entries)
|
||||||
|
|
||||||
|
|
||||||
|
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Extract boxed vocabulary entries from one PDF page for labeling.

    The page (0-indexed) is rendered at full resolution, deskewed on a
    best-effort basis, and passed to ``extract_entries_with_boxes``. The
    image actually used and the resulting entries are cached on the session
    under ``deskewed_images`` and ``gt_entries``.

    Raises:
        HTTPException: 404 for an unknown session; 400 when the session has
            no PDF or the page number is out of range.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Render the page at full resolution
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Deskewing is optional: any failure (including a missing module) keeps
    # the unrotated image and an angle of 0.
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as e:
        logger.warning(f"Deskew failed for page {page_number}: {e}")

    page_key = str(page_number)

    # Keep the image that OCR will see so it can be served later
    session.setdefault("deskewed_images", {})[page_key] = image_data

    result = await extract_entries_with_boxes(image_data)
    found_entries = result["entries"]

    # Keep the raw extraction on the session as well
    session.setdefault("gt_entries", {})[page_key] = found_entries

    return {
        "success": True,
        "entries": found_entries,
        "entry_count": len(found_entries),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Stream the cached deskewed page image as PNG.

    When nothing is cached for the page, the page is rendered from the
    session's PDF at full resolution instead.

    Raises:
        HTTPException: 404 for an unknown session; 400 when there is neither
            a cached image nor an uploaded PDF.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    cached_images = session.get("deskewed_images", {})
    image_bytes = cached_images.get(str(page_number))

    if not image_bytes:
        # Fallback: render the original page
        pdf_data = session.get("pdf_data")
        if not pdf_data:
            raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
        image_bytes = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    return StreamingResponse(io.BytesIO(image_bytes), media_type="image/png")
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Ground Truth Labeling
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Store ground-truth labels for a page in the session and on disk.

    The body must contain a non-empty ``entries`` list. According to the
    original documentation each entry has english, german, example, a
    ``status`` of 'confirmed' | 'edited' | 'skipped', and bbox fields; only
    ``status`` is read here, to build the summary counts.

    Raises:
        HTTPException: 404 for an unknown session; 400 when ``entries`` is
            missing or empty.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")

    # In-memory copy on the session
    session = sessions[session_id]
    session.setdefault("ground_truth", {})[str(page_number)] = entries

    # Durable copy on disk (saved_at is naive local time)
    gt_dir = _ground_truth_dir()
    os.makedirs(gt_dir, exist_ok=True)
    gt_path = os.path.join(gt_dir, f"{session_id}_page{page_number}.json")
    gt_data = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as gt_file:
        json.dump(gt_data, gt_file, ensure_ascii=False, indent=2)

    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    # Entries with any other (or no) status are saved but not counted.
    tally = {
        label: len([entry for entry in entries if entry.get("status") == label])
        for label in ("confirmed", "edited", "skipped")
    }

    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": tally["confirmed"],
        "edited": tally["edited"],
        "skipped": tally["skipped"],
        "file_path": gt_path,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return saved ground truth for a page, preferring the session copy.

    The response's ``source`` is ``"cache"`` when the entries come from the
    in-memory session and ``"disk"`` when they are read from the JSON file.

    Raises:
        HTTPException: 404 for an unknown session or when neither copy
            exists.
    """
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # 1) in-memory copy (an empty list counts as "not cached")
    per_page = sessions[session_id].get("ground_truth", {})
    cached = per_page.get(str(page_number))
    if cached:
        return {"success": True, "entries": cached, "source": "cache"}

    # 2) file on disk
    gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    if not os.path.exists(gt_path):
        raise HTTPException(status_code=404, detail="No ground truth found for this page")

    with open(gt_path, 'r', encoding='utf-8') as gt_file:
        stored = json.load(gt_file)

    return {"success": True, "entries": stored.get("entries", []), "source": "disk"}
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Learning Module Generation ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateLearningUnitRequest(BaseModel):
    """Optional request body for the generate-learning-unit endpoint."""

    # Forwarded unchanged as ``grade`` when the learning unit is created.
    grade: Optional[str] = None
    # When true, module generation is attempted after the unit is created.
    generate_modules: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(session_id: str, request: GenerateLearningUnitRequest = None):
    """
    Turn the session's vocabulary into a Learning Unit.

    Steps:
    1. Read the vocabulary stored on the session.
    2. Create the unit through ``vocab.learn_bridge.create_learning_unit``.
    3. If ``request.generate_modules`` is set, call
       ``generate_learning_modules``; a failure there is reported under
       ``generation`` in the response instead of failing the request.

    Returns the dict produced by ``create_learning_unit``, extended with a
    ``generation`` key when step 3 ran.

    Raises:
        HTTPException: 404 unknown session; 400 no vocabulary or a
            ``ValueError`` from the bridge; 501 when the bridge cannot be
            imported; 502 on a ``RuntimeError`` from the bridge.
    """
    if request is None:
        request = GenerateLearningUnitRequest()

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    vocabulary = session.get("vocabulary", [])
    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary in this session")

    try:
        from vocab.learn_bridge import create_learning_unit, generate_learning_modules

        # Step 1: create the unit
        unit = await create_learning_unit(
            session_name=session["name"],
            vocabulary=vocabulary,
            grade=request.grade,
        )

        if not request.generate_modules:
            return unit

        # Step 2: module generation is best-effort; the unit already exists.
        try:
            unit["generation"] = await generate_learning_modules(
                unit_id=unit["unit_id"],
                analysis_path=unit["analysis_path"],
            )
        except Exception as e:
            logger.warning(f"Module generation failed (unit created): {e}")
            unit["generation"] = {"status": "error", "reason": str(e)}

        return unit

    except ImportError:
        raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        raise HTTPException(status_code=502, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Include compare_ocr_methods & analyze_grid from companion module
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
from .compare_api import compare_router # noqa: E402
|
||||||
|
|
||||||
|
analysis_router.include_router(compare_router)
|
||||||
@@ -0,0 +1,498 @@
|
|||||||
|
"""
|
||||||
|
Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
|
||||||
|
vocabulary editing, worksheet generation, and PDF downloads.
|
||||||
|
|
||||||
|
Sub-routers (included at bottom):
|
||||||
|
- vocab_worksheet_upload_api: PDF upload, thumbnails, page processing
|
||||||
|
- vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth
|
||||||
|
"""
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, UploadFile, File, Query
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from datetime import datetime
|
||||||
|
import uuid
|
||||||
|
import os
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# --- Imports from extracted sub-modules ---
|
||||||
|
from .models import (
|
||||||
|
SessionStatus,
|
||||||
|
VocabularyEntry,
|
||||||
|
SessionCreate,
|
||||||
|
SessionResponse,
|
||||||
|
VocabularyResponse,
|
||||||
|
VocabularyUpdate,
|
||||||
|
WorksheetGenerateRequest,
|
||||||
|
WorksheetResponse,
|
||||||
|
)
|
||||||
|
from .extraction import extract_vocabulary_from_image
|
||||||
|
from .generation import (
|
||||||
|
generate_worksheet_html, generate_worksheet_pdf,
|
||||||
|
convert_pdf_page_to_image,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Database integration (used by main.py lifespan) ---
|
||||||
|
try:
|
||||||
|
from vocab.session_store import (
|
||||||
|
DATABASE_URL, get_pool, init_vocab_tables,
|
||||||
|
list_sessions_db, get_session_db,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
|
||||||
|
get_pool = None
|
||||||
|
init_vocab_tables = None
|
||||||
|
list_sessions_db = None
|
||||||
|
get_session_db = None
|
||||||
|
|
||||||
|
_db_pool = None
|
||||||
|
|
||||||
|
|
||||||
|
def set_db_pool(pool):
    """Store ``pool`` in the module-level ``_db_pool`` variable.

    Per the original note, main.py's lifespan handler calls this once the
    database pool exists.
    """
    global _db_pool
    _db_pool = pool
|
||||||
|
|
||||||
|
|
||||||
|
async def _init_vocab_table():
    """Create the vocab tables if the session store could be imported.

    Failures are logged as warnings and never raised. When the store is
    unavailable (``init_vocab_tables`` is ``None``) nothing is executed, but
    the same "ready" message is still logged.
    """
    if not init_vocab_tables:
        logger.info("vocab_session_cache table ready")
        return

    try:
        await init_vocab_tables()
        logger.info("vocab_session_cache table ready")
    except Exception as e:
        logger.warning(f"Failed to init vocab tables: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
async def _load_all_sessions():
    """Populate the in-memory session cache from the database.

    Fetches up to 500 sessions through ``list_sessions_db`` and adds every
    session that is not already cached. Sessions already present in
    ``_sessions`` are left untouched. Any error is logged as a warning and
    swallowed. Does nothing (apart from logging) when no session store is
    available.
    """
    if not list_sessions_db:
        logger.info("Loaded 0 vocab sessions from database")
        return

    try:
        rows = await list_sessions_db(limit=500)
        loaded = 0
        for row in rows:
            # Rows may expose the identifier under either key.
            session_id = row.get("id") or row.get("session_id")
            if not session_id or session_id in _sessions:
                continue
            _sessions[session_id] = dict(
                id=session_id,
                name=row.get("name", ""),
                description=row.get("description", ""),
                status=row.get("status", "created"),
                vocabulary_count=row.get("vocabulary_count", 0),
                source_language=row.get("source_language", "en"),
                target_language=row.get("target_language", "de"),
                # NOTE: stored as a string, unlike sessions created via the API.
                created_at=str(row.get("created_at", "")),
            )
            loaded += 1
        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as e:
        logger.warning(f"Failed to load sessions from database: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Router & module-level state ---
|
||||||
|
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
|
||||||
|
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
||||||
|
_sessions: Dict[str, Dict[str, Any]] = {}
|
||||||
|
_worksheets: Dict[str, Dict[str, Any]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Create a new vocabulary extraction session.

    Registers the session in the in-memory cache with status PENDING, creates
    its storage directory below ``LOCAL_STORAGE_PATH`` and returns the new
    session's metadata.
    """
    new_id = str(uuid.uuid4())
    pending = SessionStatus.PENDING.value
    created = datetime.utcnow()

    _sessions[new_id] = {
        "id": new_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": pending,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": created,
    }

    # Each session gets its own directory for uploads and generated PDFs.
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, new_id), exist_ok=True)

    return SessionResponse(
        id=new_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        status=pending,
        vocabulary_count=0,
        image_path=None,
        created_at=created,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (1-100, default 50).

    Returns:
        A list of ``SessionResponse`` objects ordered by creation time,
        most recent first.
    """
    # ``created_at`` is a datetime for sessions created through the API but a
    # string for sessions restored by _load_all_sessions(). Comparing the two
    # raises TypeError, so sort on the string form, which orders both
    # representations chronologically.
    newest_first = sorted(
        _sessions.values(),
        key=lambda entry: str(entry["created_at"]),
        reverse=True,
    )[:limit]

    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in newest_first
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return the metadata of a single session.

    Raises:
        HTTPException: 404 when the session id is not in the cache.
    """
    try:
        record = _sessions[session_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Session not found") from None

    return SessionResponse(
        id=record["id"],
        name=record["name"],
        description=record.get("description"),
        source_language=record["source_language"],
        target_language=record["target_language"],
        status=record["status"],
        vocabulary_count=record.get("vocabulary_count", 0),
        image_path=record.get("image_path"),
        created_at=record["created_at"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a textbook page image or PDF and extract vocabulary.

    Supported formats: PNG, JPG, JPEG, PDF

    The file type is accepted when either the filename extension or the
    declared content type matches. PDFs are converted to an image of their
    first page before being stored. The stored image path, the extracted
    vocabulary and the extraction confidence are written back to the session.

    Raises:
        HTTPException: 404 for an unknown session, 400 for an unsupported file type.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    if session_id not in _sessions:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    # Classify the upload by extension and by declared content type.
    extension = file.filename.rsplit('.', 1)[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    image_extensions = ('png', 'jpg', 'jpeg')
    image_content_types = ('image/png', 'image/jpeg', 'image/jpg')
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = extension in image_extensions or content_type in image_content_types

    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Pick the extension used on disk; PDFs are converted, so they end up as PNG.
    if is_pdf:
        save_extension = 'png'
    elif extension in image_extensions:
        save_extension = extension
    else:
        # Only the content type identified the image.
        save_extension = 'png' if content_type == 'image/png' else 'jpg'

    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Persist the (possibly converted) image in the session directory.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as target:
        target.write(content)

    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    # Extract vocabulary using Vision LLM
    vocabulary, confidence, error = await extract_vocabulary_from_image(
        content, file.filename or "image.png", page_number=0
    )

    extracted_status = SessionStatus.EXTRACTED.value
    vocabulary_count = len(vocabulary)
    session["vocabulary"] = [entry.dict() for entry in vocabulary]
    session["vocabulary_count"] = vocabulary_count
    session["extraction_confidence"] = confidence
    session["status"] = extracted_status

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": vocabulary_count,
        "extraction_confidence": confidence,
        "status": extracted_status,
    }
    # The error is reported alongside the (possibly partial) result, not raised.
    if error:
        result["error"] = error
    return result
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary stored for a session.

    Raises:
        HTTPException: 404 when the session id is not in the cache.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]
    # Stored entries are plain dicts; rebuild the model objects for the response.
    entries = [VocabularyEntry(**raw) for raw in record.get("vocabulary", [])]
    return VocabularyResponse(
        session_id=session_id,
        vocabulary=entries,
        extraction_confidence=record.get("extraction_confidence"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary with the submitted entries (manual corrections).

    Raises:
        HTTPException: 404 when the session id is not in the cache.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]
    replacement = [entry.dict() for entry in update.vocabulary]
    entry_count = len(update.vocabulary)

    record["vocabulary"] = replacement
    record["vocabulary_count"] = entry_count

    return {
        "session_id": session_id,
        "vocabulary_count": entry_count,
        "message": "Vocabulary updated successfully",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _render_worksheets_html(vocabulary, request, title, show_solutions):
    """Render all requested worksheet types into one HTML string, one page break after each."""
    page_break = '<div style="page-break-after: always;"></div>'
    title_suffix = " (Loesung)" if show_solutions else ""
    return "".join(
        generate_worksheet_html(
            vocabulary=vocabulary,
            worksheet_type=wtype,
            title=f"{title} - {wtype.value}{title_suffix}",
            show_solutions=show_solutions,
            repetitions=request.repetitions,
            line_height=request.line_height,
        ) + page_break
        for wtype in request.worksheet_types
    )


@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from extracted vocabulary.

    Renders every requested worksheet type into a single PDF stored in the
    session directory and, when ``request.include_solutions`` is set, a second
    PDF with solutions. The worksheet is registered in the in-memory worksheet
    cache and the session is marked COMPLETED.

    Raises:
        HTTPException: 404 for an unknown session, 400 when the session has no
            vocabulary, 500 when PDF rendering fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]

    combined_html = _render_worksheets_html(vocabulary, request, title, show_solutions=False)
    try:
        pdf_bytes = await generate_worksheet_pdf(combined_html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}") from e

    # The directory is normally created with the session, but make sure it
    # exists (as upload_image does) so writing cannot fail on a missing folder.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    # Generate solution PDF if requested
    solution_path = None
    if request.include_solutions:
        solution_html = _render_worksheets_html(vocabulary, request, title, show_solutions=True)
        try:
            solution_bytes = await generate_worksheet_pdf(solution_html)
        except Exception as e:
            # Report rendering failures the same way as for the main worksheet.
            raise HTTPException(status_code=500, detail=f"Solution PDF generation failed: {e}") from e
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Send the generated worksheet PDF as an attachment.

    Raises:
        HTTPException: 404 when the worksheet is unknown or its file is missing on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    pdf_path = _worksheets[worksheet_id]["pdf_path"]
    if not os.path.exists(pdf_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    # The whole file is read into memory and served from a buffer.
    with open(pdf_path, 'rb') as pdf_file:
        payload = io.BytesIO(pdf_file.read())

    disposition = f"attachment; filename=worksheet_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Send the solution PDF of a worksheet as an attachment.

    Raises:
        HTTPException: 404 when the worksheet is unknown, has no solution PDF
            recorded, or the file is missing on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    solution_path = _worksheets[worksheet_id].get("solution_path")
    solution_available = bool(solution_path) and os.path.exists(solution_path)
    if not solution_available:
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    # The whole file is read into memory and served from a buffer.
    with open(solution_path, 'rb') as pdf_file:
        payload = io.BytesIO(pdf_file.read())

    disposition = f"attachment; filename=solution_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the source image that was uploaded for a session.

    The media type is derived from the stored file's extension; unknown
    extensions are served as ``application/octet-stream``.

    Raises:
        HTTPException: 404 when the session is unknown, has no image recorded,
            or the file is missing on disk.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    image_path = _sessions[session_id].get("image_path")
    image_available = bool(image_path) and os.path.exists(image_path)
    if not image_available:
        raise HTTPException(status_code=404, detail="Image not found")

    media_types = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
    }
    suffix = image_path.rsplit('.', 1)[-1].lower()
    media_type = media_types.get(suffix, 'application/octet-stream')

    with open(image_path, 'rb') as image_file:
        payload = io.BytesIO(image_file.read())

    return StreamingResponse(
        payload,
        media_type=media_type,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Delete a vocabulary session together with its files and worksheets.

    Removes the session directory from disk (if present), drops the session
    from the in-memory cache and forgets every worksheet that belongs to it.

    Raises:
        HTTPException: 404 when the session id is not in the cache.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(session_dir):
        import shutil
        shutil.rmtree(session_dir)

    _sessions.pop(session_id)

    # Collect the ids first so the dict is not modified while it is iterated.
    orphaned = [
        worksheet_id
        for worksheet_id, worksheet in _worksheets.items()
        if worksheet["session_id"] == session_id
    ]
    for worksheet_id in orphaned:
        del _worksheets[worksheet_id]

    return {"message": "Session deleted successfully", "session_id": session_id}
|
||||||
|
|
||||||
|
|
||||||
|
# --- Include sub-routers ---
|
||||||
|
from .upload_api import upload_router
|
||||||
|
from .analysis_api import analysis_router
|
||||||
|
|
||||||
|
router.include_router(upload_router)
|
||||||
|
router.include_router(analysis_router)
|
||||||
@@ -0,0 +1,542 @@
|
|||||||
|
"""
|
||||||
|
Vocabulary Worksheet Compare & Grid Analysis API.
|
||||||
|
|
||||||
|
Split from vocab_worksheet_analysis_api.py — contains the two largest
|
||||||
|
route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, Query
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from .extraction import extract_vocabulary_from_image
|
||||||
|
|
||||||
|
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||||
|
VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
|
||||||
|
|
||||||
|
def _get_sessions():
    """Return the session cache dict owned by the sibling ``api`` module.

    The import happens at call time rather than at module import time.
    NOTE(review): presumably this avoids a circular import with ``.api`` — confirm.
    """
    from .api import _sessions as session_cache
    return session_cache
|
||||||
|
from .generation import convert_pdf_page_to_image
|
||||||
|
|
||||||
|
# Try to import Tesseract extractor
|
||||||
|
try:
|
||||||
|
from tesseract_vocab_extractor import (
|
||||||
|
run_tesseract_pipeline,
|
||||||
|
match_positions_to_vocab, TESSERACT_AVAILABLE,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
TESSERACT_AVAILABLE = False
|
||||||
|
|
||||||
|
# Try to import CV Pipeline
|
||||||
|
try:
|
||||||
|
from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
|
||||||
|
except ImportError:
|
||||||
|
CV_PIPELINE_AVAILABLE = False
|
||||||
|
|
||||||
|
# Try to import Grid Detection Service
|
||||||
|
try:
|
||||||
|
from services.grid_detection_service import GridDetectionService
|
||||||
|
GRID_SERVICE_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
GRID_SERVICE_AVAILABLE = False
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
compare_router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# OCR Compare & Grid Analysis Endpoints
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def _vocab_rows(entries, example_key="example"):
    """Reduce dict-like vocabulary entries to plain english/german/example rows."""
    return [
        {
            "english": entry.get("english", ""),
            "german": entry.get("german", ""),
            "example": entry.get(example_key, ""),
        }
        for entry in entries
    ]


def _vocab_pairs(rows):
    """Return the set of lower-cased, stripped (english, german) pairs; rows missing either side are skipped."""
    return {
        (row["english"].lower().strip(), row["german"].lower().strip())
        for row in rows
        if row["english"] and row["german"]
    }


def _failed_method_result(name, model, exc):
    """Build the result payload for an OCR method that raised ``exc``."""
    return {
        "name": name,
        "model": model,
        "duration_seconds": 0,
        "vocabulary_count": 0,
        "vocabulary": [],
        "confidence": 0,
        "success": False,
        "error": str(exc),
    }


@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(session_id: str, page_number: int):
    """
    Run multiple OCR methods on a page and compare results.

    This endpoint:
    1. Gets the page image from the session's uploaded PDF
    2. Runs Vision LLM extraction (primary method)
    3. Runs Tesseract extraction when it is available
    4. Runs the CV pipeline when it is available
    5. Compares found vocabulary across methods
    6. Returns structured comparison results

    A failing method does not abort the request; it is reported with
    ``success: False`` and its error message. Tesseract output is cached in
    the session for later use (grid analysis, position matching).

    page_number is 0-indexed.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF was
            uploaded or the page number is out of range.
    """
    import time
    import traceback

    logger.info(f"Compare OCR for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    methods_results = {}
    all_vocab_sets = {}

    # --- Method: Vision LLM ---
    try:
        start = time.time()
        vocab, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
        )
        duration = time.time() - start

        # Entries may be model objects, plain objects or mappings; normalise to dicts.
        vocab_list = _vocab_rows(
            [
                v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
                for v in vocab
            ],
            example_key="example_sentence",
        )

        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": round(duration, 1),
            "vocabulary_count": len(vocab_list),
            "vocabulary": vocab_list,
            "confidence": confidence,
            "success": len(vocab_list) > 0 and not error,
            "error": error if error else None,
        }
        all_vocab_sets["vision_llm"] = _vocab_pairs(vocab_list)
    except Exception as e:
        logger.error(f"Vision LLM failed: {e}")
        methods_results["vision_llm"] = _failed_method_result("Vision LLM", VISION_MODEL, e)
        all_vocab_sets["vision_llm"] = set()

    # --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
    if TESSERACT_AVAILABLE:
        try:
            start = time.time()
            tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
            duration = time.time() - start

            tess_vocab = tess_result.get("vocabulary", [])
            tess_words = tess_result.get("words", [])

            # Store Tesseract words in session for later use (grid analysis, position matching)
            session["tesseract_words"] = tess_words
            session["tesseract_image_width"] = tess_result.get("image_width", 0)
            session["tesseract_image_height"] = tess_result.get("image_height", 0)
            session[f"tesseract_page_{page_number}"] = tess_result

            vocab_list_tess = _vocab_rows(tess_vocab)

            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr (eng+deu)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_tess),
                "vocabulary": vocab_list_tess,
                "confidence": 0.7 if tess_vocab else 0,
                "success": len(vocab_list_tess) > 0,
                "error": tess_result.get("error"),
                "word_count": tess_result.get("word_count", 0),
                "columns_detected": len(tess_result.get("columns", [])),
            }
            all_vocab_sets["tesseract"] = _vocab_pairs(vocab_list_tess)

            # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
            if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
                llm_vocab_with_bbox = match_positions_to_vocab(
                    tess_words,
                    methods_results["vision_llm"]["vocabulary"],
                    tess_result.get("image_width", 1),
                    tess_result.get("image_height", 1),
                )
                methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox

        except Exception as e:
            logger.error(f"Tesseract failed: {e}")
            logger.debug(traceback.format_exc())
            methods_results["tesseract"] = _failed_method_result("Tesseract OCR", "tesseract-ocr", e)
            all_vocab_sets["tesseract"] = set()

    # --- Method: CV Pipeline (Document Reconstruction) ---
    if CV_PIPELINE_AVAILABLE:
        try:
            start = time.time()
            cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
            duration = time.time() - start

            # A pipeline that reports an error contributes no vocabulary.
            cv_vocab = cv_result.vocabulary if not cv_result.error else []
            vocab_list_cv = _vocab_rows(cv_vocab)

            methods_results["cv_pipeline"] = {
                "name": "CV Pipeline (Document Reconstruction)",
                "model": "opencv + tesseract (multi-pass)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_cv),
                "vocabulary": vocab_list_cv,
                "confidence": 0.8 if cv_vocab else 0,
                "success": len(vocab_list_cv) > 0,
                "error": cv_result.error,
                "word_count": cv_result.word_count,
                "columns_detected": cv_result.columns_detected,
                "stages": cv_result.stages,
            }
            all_vocab_sets["cv_pipeline"] = _vocab_pairs(vocab_list_cv)

        except Exception as e:
            logger.error(f"CV Pipeline failed: {e}")
            logger.debug(traceback.format_exc())
            methods_results["cv_pipeline"] = _failed_method_result(
                "CV Pipeline (Document Reconstruction)", "opencv + tesseract (multi-pass)", e
            )
            all_vocab_sets["cv_pipeline"] = set()

    # --- Build comparison ---
    all_unique = set().union(*all_vocab_sets.values())

    found_by_all = []
    found_by_some = []
    for english, german in sorted(all_unique):
        found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
        entry = {"english": english, "german": german, "methods": found_in}
        if len(found_in) == len(all_vocab_sets):
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)

    # Share of unique pairs that every method found; 0 when nothing was found at all.
    agreement_rate = len(found_by_all) / len(all_unique) if all_unique else 0

    # Best method = the one that produced the most distinct vocabulary pairs.
    best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"

    return {
        "session_id": session_id,
        "page_number": page_number,
        "methods": methods_results,
        "comparison": {
            "found_by_all_methods": found_by_all,
            "found_by_some_methods": found_by_some,
            "total_unique_vocabulary": len(all_unique),
            "agreement_rate": agreement_rate,
        },
        "recommendation": {
            "best_method": best_method,
            "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
|
||||||
|
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
|
||||||
|
"""
|
||||||
|
Analyze the grid/table structure of a vocabulary page.
|
||||||
|
|
||||||
|
Hybrid approach:
|
||||||
|
1. If Tesseract bounding boxes are available (from compare-ocr), use them for
|
||||||
|
real spatial positions via GridDetectionService.
|
||||||
|
2. Otherwise fall back to Vision LLM for grid structure detection.
|
||||||
|
|
||||||
|
page_number is 0-indexed.
|
||||||
|
Returns GridData structure expected by the frontend GridOverlay component.
|
||||||
|
"""
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
|
||||||
|
|
||||||
|
if session_id not in _get_sessions():
|
||||||
|
raise HTTPException(status_code=404, detail="Session not found")
|
||||||
|
|
||||||
|
session = _get_sessions()[session_id]
|
||||||
|
pdf_data = session.get("pdf_data")
|
||||||
|
|
||||||
|
if not pdf_data:
|
||||||
|
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||||
|
|
||||||
|
page_count = session.get("pdf_page_count", 1)
|
||||||
|
if page_number < 0 or page_number >= page_count:
|
||||||
|
raise HTTPException(status_code=400, detail=f"Invalid page number.")
|
||||||
|
|
||||||
|
# Convert page to image
|
||||||
|
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||||
|
|
||||||
|
# --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
|
||||||
|
tess_page_data = session.get(f"tesseract_page_{page_number}")
|
||||||
|
|
||||||
|
if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
|
||||||
|
try:
|
||||||
|
# Run Tesseract if not already cached
|
||||||
|
if not tess_page_data:
|
||||||
|
logger.info("Running Tesseract for grid analysis (not cached)")
|
||||||
|
from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
|
||||||
|
tess_page_data = await _run_tess(image_data, lang="eng+deu")
|
||||||
|
session[f"tesseract_page_{page_number}"] = tess_page_data
|
||||||
|
session["tesseract_words"] = tess_page_data.get("words", [])
|
||||||
|
session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
|
||||||
|
session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
|
||||||
|
|
||||||
|
tess_words = tess_page_data.get("words", [])
|
||||||
|
img_w = tess_page_data.get("image_width", 0)
|
||||||
|
img_h = tess_page_data.get("image_height", 0)
|
||||||
|
|
||||||
|
if tess_words and img_w > 0 and img_h > 0:
|
||||||
|
service = GridDetectionService()
|
||||||
|
regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
|
||||||
|
|
||||||
|
if regions:
|
||||||
|
grid_result = service.detect_grid(regions)
|
||||||
|
grid_dict = grid_result.to_dict()
|
||||||
|
|
||||||
|
# Merge LLM text if available (better quality than Tesseract text)
|
||||||
|
# The LLM vocab was stored during compare-ocr
|
||||||
|
grid_dict["source"] = "tesseract+grid_service"
|
||||||
|
grid_dict["word_count"] = len(tess_words)
|
||||||
|
|
||||||
|
logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
|
||||||
|
f"{grid_result.stats.get('recognized', 0)} recognized")
|
||||||
|
|
||||||
|
return {"success": True, "grid": grid_dict}
|
||||||
|
|
||||||
|
logger.info("Tesseract data insufficient, falling back to LLM")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
|
||||||
|
import traceback
|
||||||
|
logger.debug(traceback.format_exc())
|
||||||
|
|
||||||
|
# --- Strategy 2: Fall back to Vision LLM ---
|
||||||
|
image_base64 = base64.b64encode(image_data).decode("utf-8")
|
||||||
|
|
||||||
|
grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
|
||||||
|
|
||||||
|
Your task: Identify the TABLE STRUCTURE and extract each cell's content.
|
||||||
|
|
||||||
|
Return a JSON object with this EXACT structure:
|
||||||
|
{
|
||||||
|
"rows": <number of rows>,
|
||||||
|
"columns": <number of columns>,
|
||||||
|
"column_types": ["english", "german", "example"],
|
||||||
|
"entries": [
|
||||||
|
{
|
||||||
|
"row": 0,
|
||||||
|
"col": 0,
|
||||||
|
"text": "the word or phrase in this cell",
|
||||||
|
"column_type": "english",
|
||||||
|
"confidence": 0.95
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- row and col are 0-indexed
|
||||||
|
- column_type is one of: "english", "german", "example", "unknown"
|
||||||
|
- Detect whether each column contains English words, German translations, or example sentences
|
||||||
|
- Include ALL non-empty cells
|
||||||
|
- confidence is 0.0-1.0 based on how clear the text is
|
||||||
|
- If a cell is empty, don't include it
|
||||||
|
- Return ONLY the JSON, no other text"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
raw_text = ""
|
||||||
|
max_retries = 3
|
||||||
|
for attempt in range(max_retries):
|
||||||
|
async with httpx.AsyncClient(timeout=300.0) as client:
|
||||||
|
response = await client.post(
|
||||||
|
f"{OLLAMA_URL}/api/chat",
|
||||||
|
json={
|
||||||
|
"model": VISION_MODEL,
|
||||||
|
"messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
|
||||||
|
"stream": False,
|
||||||
|
"options": {"temperature": 0.1, "num_predict": 8192},
|
||||||
|
},
|
||||||
|
timeout=300.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code == 500 and attempt < max_retries - 1:
|
||||||
|
wait_time = 10 * (attempt + 1)
|
||||||
|
logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
|
||||||
|
await asyncio.sleep(wait_time)
|
||||||
|
continue
|
||||||
|
elif response.status_code != 200:
|
||||||
|
error_detail = response.text[:200] if response.text else "Unknown error"
|
||||||
|
return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}
|
||||||
|
|
||||||
|
raw_text = response.json().get("message", {}).get("content", "")
|
||||||
|
break
|
||||||
|
|
||||||
|
# Parse JSON from response
|
||||||
|
import re
|
||||||
|
json_match = re.search(r'\{[\s\S]*\}', raw_text)
|
||||||
|
if not json_match:
|
||||||
|
return {"success": False, "error": "Could not parse grid structure from LLM response"}
|
||||||
|
|
||||||
|
grid_raw = json.loads(json_match.group())
|
||||||
|
|
||||||
|
num_rows = grid_raw.get("rows", 0)
|
||||||
|
num_cols = grid_raw.get("columns", 0)
|
||||||
|
column_types = grid_raw.get("column_types", [])
|
||||||
|
entries = grid_raw.get("entries", [])
|
||||||
|
|
||||||
|
if num_rows == 0 or num_cols == 0:
|
||||||
|
return {"success": False, "error": "No grid structure detected"}
|
||||||
|
|
||||||
|
# Ensure column_types has the right length
|
||||||
|
while len(column_types) < num_cols:
|
||||||
|
column_types.append("unknown")
|
||||||
|
|
||||||
|
# Build cell grid with percentage-based coordinates
|
||||||
|
row_height = 100.0 / num_rows
|
||||||
|
col_width = 100.0 / num_cols
|
||||||
|
|
||||||
|
# Track which cells have content
|
||||||
|
cell_map = {}
|
||||||
|
for entry in entries:
|
||||||
|
r = entry.get("row", 0)
|
||||||
|
c = entry.get("col", 0)
|
||||||
|
cell_map[(r, c)] = entry
|
||||||
|
|
||||||
|
cells = []
|
||||||
|
recognized_count = 0
|
||||||
|
empty_count = 0
|
||||||
|
problematic_count = 0
|
||||||
|
|
||||||
|
for r in range(num_rows):
|
||||||
|
row_cells = []
|
||||||
|
for c in range(num_cols):
|
||||||
|
x = c * col_width
|
||||||
|
y = r * row_height
|
||||||
|
|
||||||
|
if (r, c) in cell_map:
|
||||||
|
entry = cell_map[(r, c)]
|
||||||
|
text = entry.get("text", "").strip()
|
||||||
|
conf = entry.get("confidence", 0.8)
|
||||||
|
col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")
|
||||||
|
|
||||||
|
if text:
|
||||||
|
status = "recognized" if conf >= 0.5 else "problematic"
|
||||||
|
if status == "recognized":
|
||||||
|
recognized_count += 1
|
||||||
|
else:
|
||||||
|
problematic_count += 1
|
||||||
|
else:
|
||||||
|
status = "empty"
|
||||||
|
empty_count += 1
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
|
conf = 0.0
|
||||||
|
col_type = column_types[c] if c < len(column_types) else "unknown"
|
||||||
|
status = "empty"
|
||||||
|
empty_count += 1
|
||||||
|
|
||||||
|
row_cells.append({
|
||||||
|
"row": r,
|
||||||
|
"col": c,
|
||||||
|
"x": round(x, 2),
|
||||||
|
"y": round(y, 2),
|
||||||
|
"width": round(col_width, 2),
|
||||||
|
"height": round(row_height, 2),
|
||||||
|
"text": text,
|
||||||
|
"confidence": conf,
|
||||||
|
"status": status,
|
||||||
|
"column_type": col_type,
|
||||||
|
})
|
||||||
|
cells.append(row_cells)
|
||||||
|
|
||||||
|
total = num_rows * num_cols
|
||||||
|
coverage = (recognized_count + problematic_count) / max(total, 1)
|
||||||
|
|
||||||
|
# Column and row boundaries as percentages
|
||||||
|
col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
|
||||||
|
row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]
|
||||||
|
|
||||||
|
grid_data = {
|
||||||
|
"rows": num_rows,
|
||||||
|
"columns": num_cols,
|
||||||
|
"cells": cells,
|
||||||
|
"column_types": column_types,
|
||||||
|
"column_boundaries": col_boundaries,
|
||||||
|
"row_boundaries": row_boundaries,
|
||||||
|
"deskew_angle": 0.0,
|
||||||
|
"source": "vision_llm",
|
||||||
|
"stats": {
|
||||||
|
"recognized": recognized_count,
|
||||||
|
"problematic": problematic_count,
|
||||||
|
"empty": empty_count,
|
||||||
|
"manual": 0,
|
||||||
|
"total": total,
|
||||||
|
"coverage": round(coverage, 3),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
return {"success": True, "grid": grid_data}
|
||||||
|
|
||||||
|
except httpx.TimeoutException:
|
||||||
|
logger.error("Grid analysis timed out")
|
||||||
|
return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Grid analysis failed: {e}")
|
||||||
|
import traceback
|
||||||
|
logger.debug(traceback.format_exc())
|
||||||
|
return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}
|
||||||
@@ -0,0 +1,325 @@
|
|||||||
|
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
|
||||||
|
|
||||||
|
Contains:
|
||||||
|
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
|
||||||
|
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
|
||||||
|
- _get_demo_vocabulary(): Demo data for testing
|
||||||
|
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
|
||||||
|
"""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from .models import VocabularyEntry
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# Ollama endpoint and vision model; both can be overridden via environment
# variables, otherwise the defaults below are used.
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.environ.get("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Vision LLM Vocabulary Extraction
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
|
||||||
|
|
||||||
|
AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
|
||||||
|
|
||||||
|
{
|
||||||
|
"vocabulary": [
|
||||||
|
{
|
||||||
|
"english": "to improve",
|
||||||
|
"german": "verbessern",
|
||||||
|
"example": "I want to improve my English."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
REGELN:
|
||||||
|
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
|
||||||
|
2. Behalte die exakte Schreibweise bei
|
||||||
|
3. Bei fehlenden Beispielsaetzen: "example": null
|
||||||
|
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
|
||||||
|
5. Gib NUR valides JSON zurueck, keine Erklaerungen
|
||||||
|
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
|
||||||
|
|
||||||
|
Beispiel-Output:
|
||||||
|
{
|
||||||
|
"vocabulary": [
|
||||||
|
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
|
||||||
|
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
|
||||||
|
]
|
||||||
|
}"""
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """Extract vocabulary entries from a single page image.

    By default the image is sent to the Ollama vision model. When
    ``use_hybrid`` is true the hybrid extractor (PaddleOCR + LLM) is tried
    first; the vision model is then only used if the hybrid path reports an
    error, yields nothing, or fails.

    Args:
        image_data: Raw image bytes.
        filename: Original filename, used for log messages only.
        page_number: 0-indexed page number; reported 1-indexed in entries
            and error messages.
        use_hybrid: Try the hybrid extractor before the vision model.

    Returns:
        Tuple ``(vocabulary_entries, confidence, error_message)``.
        ``error_message`` is an empty string on success.
    """
    human_page = page_number + 1  # 1-indexed page for entries and messages

    # ----------------------------------------------------------------------
    # Optional first attempt: hybrid extractor (off by default)
    # ----------------------------------------------------------------------
    if use_hybrid:
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")

            raw_items, hybrid_confidence, hybrid_error = await extract_vocabulary_hybrid(image_data, page_number)

            if hybrid_error:
                # Not fatal: continue with the vision model below.
                logger.warning(f"Hybrid extraction had issues: {hybrid_error}")
            elif raw_items:
                hybrid_entries = []
                for item in raw_items:
                    # Only keep rows that carry both languages.
                    if not (item.get("english") and item.get("german")):
                        continue
                    hybrid_entries.append(
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=item.get("english", ""),
                            german=item.get("german", ""),
                            example_sentence=item.get("example"),
                            source_page=human_page
                        )
                    )
                logger.info(f"Hybrid extraction: {len(hybrid_entries)} entries from {filename}")
                return hybrid_entries, hybrid_confidence, ""

        except ImportError as import_error:
            logger.warning(f"Hybrid extractor not available: {import_error}. Falling back to Vision LLM.")
        except Exception as hybrid_failure:
            logger.warning(f"Hybrid extraction failed: {hybrid_failure}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())

    # ----------------------------------------------------------------------
    # Vision LLM via Ollama (model taken from VISION_MODEL)
    # ----------------------------------------------------------------------
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # Quick reachability probe before committing to the slow chat call.
        async with httpx.AsyncClient(timeout=10.0) as probe_client:
            try:
                tags_reply = await probe_client.get(f"{OLLAMA_URL}/api/tags")
                if tags_reply.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {human_page}: Ollama nicht verfuegbar"
            except Exception as probe_error:
                logger.error(f"Ollama health check failed: {probe_error}")
                return [], 0.0, f"Seite {human_page}: Verbindung zu Ollama fehlgeschlagen"

        encoded_image = base64.b64encode(image_data).decode("utf-8")
        user_message = {
            "role": "user",
            "content": VOCAB_EXTRACTION_PROMPT,
            "images": [encoded_image]
        }
        request_body = {
            "model": VISION_MODEL,
            "messages": [user_message],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            }
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Vision models can be slow; the per-request timeout of 300 s
        # (5 minutes per page) is what applies to this call.
        async with httpx.AsyncClient(timeout=600.0) as chat_client:
            chat_reply = await chat_client.post(
                f"{OLLAMA_URL}/api/chat",
                json=request_body,
                timeout=300.0
            )
            chat_reply.raise_for_status()

        reply_json = chat_reply.json()
        llm_text = reply_json.get("message", {}).get("content", "")

        logger.info(f"Ollama response received: {len(llm_text)} chars")

        entries = parse_vocabulary_json(llm_text)

        # Tag every entry with the (1-indexed) page it came from.
        for item in entries:
            item.source_page = human_page

        # Fixed heuristic: any result counts as fairly confident.
        if entries:
            estimated_confidence = 0.85
        else:
            estimated_confidence = 0.1

        logger.info(f"Vision LLM extracted {len(entries)} vocabulary entries from {filename}")

        return entries, estimated_confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {human_page}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as failure:
        logger.error(f"Vocabulary extraction failed for {filename}: {failure}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {human_page}: Fehler - {str(failure)[:50]}"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Build a fixed set of sample entries for use when no Vision LLM is available.

    Each call returns fresh ``VocabularyEntry`` objects with newly generated
    UUIDs; only english, german and the example sentence are populated.
    """
    # (english, german, example sentence)
    samples = (
        ("to achieve", "erreichen, erzielen", "She achieved her goals."),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
        ("improve", "verbessern", "I want to improve my English."),
        ("improvement", "Verbesserung", "There has been a lot of improvement."),
        ("success", "Erfolg", "The project was a success."),
        ("successful", "erfolgreich", "She is a successful businesswoman."),
        ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
        ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
    )

    demo = []
    for english_term, german_term, sentence in samples:
        demo.append(
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english_term,
                german=german_term,
                example_sentence=sentence,
            )
        )
    return demo
|
||||||
|
|
||||||
|
|
||||||
|
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary entries out of a raw LLM response.

    The response may contain extra prose around the JSON object, raw control
    characters inside strings, trailing commas, unquoted keys, or may be cut
    off mid-object. Parsing is attempted in this order:

    1. strict ``json.loads``
    2. control characters removed, raw newlines/tabs tolerated inside strings
    3. additionally trailing commas removed and bare keys quoted
    4. regex extraction of individual ``{"english": ..., "german": ...}``
       objects from the whole text

    Entries without both an English and a German string, or with implausibly
    long values (likely hallucinations), are skipped individually instead of
    discarding the whole response.

    Args:
        text: Raw text returned by the model.

    Returns:
        The parsed entries; an empty list if nothing usable was found.
        This function does not raise.
    """
    max_english_len = 100
    max_german_len = 200

    def clean_json_string(s: str) -> str:
        """Remove control characters that are never valid in JSON text.

        Newlines, carriage returns and tabs are kept: between tokens they are
        legal whitespace, and inside strings they are accepted by the
        non-strict decoder used below. Escaping them globally would corrupt
        the structural whitespace of multi-line JSON.
        """
        return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)

    def try_parse_json(json_str: str):
        """Return the decoded JSON value, or None if every strategy fails."""
        # Strategy 1: the response is already valid JSON.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: strip illegal control characters and let the decoder
        # accept raw newlines/tabs inside string values (strict=False).
        cleaned = clean_json_string(json_str)
        try:
            return json.loads(cleaned, strict=False)
        except json.JSONDecodeError:
            pass

        # Strategy 3: repair common LLM slips on top of the cleaned text.
        fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)  # trailing commas
        fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)  # bare keys
        try:
            return json.loads(fixed, strict=False)
        except json.JSONDecodeError:
            return None

    def build_entry(english, german, example=None, word_type=None):
        """Validate one raw entry; return a VocabularyEntry or None to skip it."""
        # The model may emit null or non-string values; treat them as missing.
        english = english.strip() if isinstance(english, str) else ""
        german = german.strip() if isinstance(german, str) else ""

        # Very long values are almost certainly hallucinated prose.
        if len(english) > max_english_len or len(german) > max_german_len:
            logger.warning(f"Skipping suspicious entry: {english[:50]}...")
            return None

        if not english or not german:
            return None

        return VocabularyEntry(
            id=str(uuid.uuid4()),
            english=english,
            german=german,
            example_sentence=example if isinstance(example, str) else None,
            word_type=word_type if isinstance(word_type, str) else None,
        )

    try:
        # Locate the JSON object inside the (possibly chatty) response.
        start = text.find('{')
        if start == -1:
            logger.warning("No JSON found in response")
            return []

        end = text.rfind('}') + 1
        # A truncated response may lack a closing brace after the opening
        # one; keep everything from the first brace so the fallbacks can
        # still recover complete entries.
        json_str = text[start:end] if end > start else text[start:]
        data = try_parse_json(json_str)

        if data is None:
            # Strategy 4: pull out individual entry objects with a regex.
            logger.warning("JSON parsing failed, trying regex extraction")
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            vocabulary = []
            for english, german, example in re.findall(pattern, text, re.IGNORECASE | re.DOTALL):
                vocab_entry = build_entry(english, german, example.strip() if example else None)
                if vocab_entry is not None:
                    vocabulary.append(vocab_entry)

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded.
        raw_entries = data.get("vocabulary") if isinstance(data, dict) else None
        if not isinstance(raw_entries, list):
            logger.warning("Parsed JSON contains no 'vocabulary' list")
            return []

        vocabulary = []
        for entry in raw_entries:
            if not isinstance(entry, dict):
                continue
            vocab_entry = build_entry(
                entry.get("english"),
                entry.get("german"),
                entry.get("example"),
                entry.get("word_type"),
            )
            if vocab_entry is not None:
                vocabulary.append(vocab_entry)

        return vocabulary

    except Exception:
        # Safety net: callers rely on always getting a list back.
        logger.exception("Failed to parse vocabulary JSON")
        return []
|
||||||
@@ -0,0 +1,258 @@
|
|||||||
|
"""
|
||||||
|
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.
|
||||||
|
|
||||||
|
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
|
||||||
|
|
||||||
|
Functions:
|
||||||
|
- generate_worksheet_html(): Build HTML for various worksheet types
|
||||||
|
- generate_worksheet_pdf(): Convert HTML to PDF via WeasyPrint
|
||||||
|
- get_pdf_page_count(): Count pages in a PDF (PyMuPDF)
|
||||||
|
- convert_pdf_page_to_image(): Render single PDF page to PNG
|
||||||
|
- convert_pdf_to_images(): Render multiple PDF pages to PNG
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from fastapi import HTTPException
|
||||||
|
|
||||||
|
from .models import VocabularyEntry, WorksheetType
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# Optional dependency: WeasyPrint.
# OSError is caught as well as ImportError, so a failure while loading the
# package is treated the same as the package being absent.
WEASYPRINT_AVAILABLE = False
try:
    from weasyprint import HTML as _WeasyHTML
except (ImportError, OSError):
    logger.warning("WeasyPrint not available")
else:
    WEASYPRINT_AVAILABLE = True

# Optional dependency: PyMuPDF (imported under the name "fitz").
FITZ_AVAILABLE = False
try:
    import fitz  # PyMuPDF
except ImportError:
    logger.warning("PyMuPDF (fitz) not available")
else:
    FITZ_AVAILABLE = True
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Worksheet HTML Generation
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Generate the HTML document for one worksheet.

    All text taken from ``title`` and ``vocabulary`` is HTML-escaped, since
    it originates from OCR/LLM output or user edits.

    Args:
        vocabulary: Entries to put on the worksheet.
        worksheet_type: EN_TO_DE, DE_TO_EN, COPY_PRACTICE or GAP_FILL. Any
            other value yields a page with only the title and name line.
        title: Heading shown at the top of the page.
        show_solutions: Render the answers instead of blanks.
        repetitions: For COPY_PRACTICE with solutions, how often each word
            is repeated in the answer cell.
        line_height: "normal", "large" or "extra-large"; unknown values fall
            back to the "normal" spacing.

    Returns:
        A complete HTML document as a string.
    """
    # Function-scope imports: the local name ``html`` is avoided below so it
    # cannot be confused with the stdlib module.
    import re
    from html import escape

    gap_markup = '<span class="gap"></span>'

    def text(value) -> str:
        """Escape a value for use as an HTML text node."""
        return escape(str(value), quote=False)

    def translation_section(heading: str, prompt_attr: str, answer_attr: str) -> str:
        """Render a two-column table: given word | answer (or blank)."""
        rows = []
        for entry in vocabulary:
            if show_solutions:
                answer_cell = f'<td class="vocab-answer">{text(getattr(entry, answer_attr))}</td>'
            else:
                answer_cell = '<td class="vocab-blank"></td>'
            rows.append(f'<tr><td class="vocab-word">{text(getattr(entry, prompt_attr))}</td>{answer_cell}</tr>')
        return (
            f'<div class="section"><div class="section-title">{heading}</div>'
            '<table class="vocab-table">' + ''.join(rows) + '</table></div>'
        )

    def gap_sentence_for(sentence: str, english: str) -> str:
        """Return the escaped sentence with the vocabulary word blanked out.

        The complete phrase is tried first, then its individual words in
        order. Matching is case-insensitive, starts at a word boundary and
        swallows trailing word characters so inflected forms ("achieved")
        are blanked as a whole. If nothing matches, the sentence is
        returned unchanged (escaped).
        """
        words = english.split()
        candidates = []
        if len(words) > 1:
            candidates.append(r'\s+'.join(re.escape(w) for w in words))
        for word in words:
            # The infinitive marker alone would blank the wrong word.
            if len(words) > 1 and word.lower() == "to":
                continue
            candidates.append(re.escape(word))

        for candidate in candidates:
            pieces = re.split(r'(?<!\w)' + candidate + r'\w*', sentence, flags=re.IGNORECASE)
            if len(pieces) > 1:
                # Escape the surrounding text only; the gap markup is
                # inserted once and never re-scanned.
                return gap_markup.join(text(piece) for piece in pieces)
        return text(sentence)

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    parts = [f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
@page {{ size: A4; margin: 2cm; }}
body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
h1 {{ font-size: 24px; margin-bottom: 10px; }}
.meta {{ color: #666; margin-bottom: 20px; }}
.name-line {{ margin-bottom: 30px; }}
.vocab-table {{ width: 100%; border-collapse: collapse; }}
.vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
.vocab-word {{ width: 40%; font-weight: 500; }}
.vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
.vocab-answer {{ width: 60%; color: #2563eb; }}
.gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
.hint {{ color: #666; font-style: italic; font-size: 12px; }}
.section {{ margin-top: 30px; }}
.section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
</style>
</head>
<body>
<h1>{text(title)}</h1>
<div class="name-line">Name: _________________________ Datum: _____________</div>
"""]

    # NOTE(review): WorksheetType.COMBINED has no branch here and renders an
    # empty worksheet body - presumably the caller expands it; confirm.
    if worksheet_type == WorksheetType.EN_TO_DE:
        parts.append(translation_section("Uebersetze ins Deutsche:", "english", "german"))

    elif worksheet_type == WorksheetType.DE_TO_EN:
        parts.append(translation_section("Uebersetze ins Englische:", "german", "english"))

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        parts.append('<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            word = text(entry.english)
            parts.append(f'<tr><td class="vocab-word">{word}</td>')
            parts.append('<td class="vocab-blank">')
            if show_solutions:
                parts.append(f' {word} ' * repetitions)
            parts.append('</td></tr>')
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.GAP_FILL:
        # Only entries that come with an example sentence can be used.
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            parts.append('<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>')
            for i, entry in enumerate(entries_with_examples, 1):
                gap_sentence = gap_sentence_for(entry.example_sentence, entry.english)
                parts.append(f'<p>{i}. {gap_sentence}</p>')
                if show_solutions:
                    parts.append(f'<p class="hint">Loesung: {text(entry.english)}</p>')
                else:
                    parts.append(f'<p class="hint">({text(entry.german)})</p>')
            parts.append('</div>')

    parts.append('</body></html>')
    return ''.join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Worksheet PDF Generation
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
async def generate_worksheet_pdf(html: str) -> bytes:
    """Render worksheet HTML to PDF bytes using WeasyPrint.

    If WeasyPrint cannot be loaded, the HTML itself is returned UTF-8
    encoded as a best-effort fallback, so the result is not guaranteed to be
    a PDF.

    Args:
        html: Complete HTML document.

    Returns:
        PDF bytes, or the encoded HTML when WeasyPrint is unavailable.

    Raises:
        Exception: Whatever WeasyPrint raises while rendering; it is logged
            and re-raised unchanged.
    """
    # Loading WeasyPrint can fail with OSError as well as ImportError (the
    # module-level availability probe treats both as "unavailable"), so both
    # take the HTML fallback here instead of surfacing as a render failure.
    try:
        from weasyprint import HTML
    except (ImportError, OSError):
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')

    try:
        return HTML(string=html).write_pdf()
    except Exception as e:
        logger.error(f"PDF generation failed: {e}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# PDF Utilities (PyMuPDF)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return the number of pages in a PDF.

    Args:
        pdf_data: PDF file content as bytes.

    Returns:
        The page count, or 0 if the PDF cannot be opened or PyMuPDF is not
        installed (the failure is logged, not raised).
    """
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            return pdf_document.page_count
        finally:
            # Release the document even if reading the count fails.
            pdf_document.close()
    except Exception as e:
        logger.error(f"Failed to get PDF page count: {e}")
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Render one page of a PDF to a PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes.
        page_number: 0-indexed page number; must be within the document.
        thumbnail: If True, render at zoom 0.5 (small preview) instead of
            zoom 2.0 (higher resolution for OCR).

    Returns:
        PNG image bytes.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 if the PDF
            cannot be opened, is empty, or the page does not exist.
    """
    try:
        import fitz  # PyMuPDF

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            total_pages = pdf_document.page_count
            if total_pages == 0:
                raise ValueError("PDF has no pages")

            # Reject negative numbers explicitly: indexing the document with
            # them would silently select a page counted from the end.
            if not 0 <= page_number < total_pages:
                raise ValueError(f"Page {page_number} does not exist (PDF has {total_pages} pages)")

            page = pdf_document[page_number]

            # Thumbnails: lower resolution; OCR input: higher resolution.
            zoom = 0.5 if thumbnail else 2.0
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            png_data = pix.tobytes("png")
        finally:
            # Close the document on success and on every error path.
            pdf_document.close()

        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data

    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed") from e
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e
|
||||||
|
|
||||||
|
|
||||||
|
async def convert_pdf_to_images(pdf_data: bytes, pages: List[int] = None) -> List[bytes]:
    """Render several pages of a PDF to PNG images (zoom 2.0).

    Args:
        pdf_data: PDF file as bytes.
        pages: 0-indexed page numbers to convert, in the order given. If
            None, all pages are converted. Page numbers outside the document
            are skipped with a warning.

    Returns:
        One PNG byte string per converted page.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 if the PDF
            cannot be opened, is empty, or rendering fails.
    """
    try:
        import fitz

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            total_pages = pdf_document.page_count
            if total_pages == 0:
                raise ValueError("PDF has no pages")

            # If no pages specified, convert all
            if pages is None:
                pages = list(range(total_pages))

            images = []
            mat = fitz.Matrix(2.0, 2.0)

            for page_num in pages:
                # Negative numbers are rejected too: indexing the document
                # with them would silently select pages from the end.
                if not 0 <= page_num < total_pages:
                    logger.warning(f"Skipping page {page_num}: out of range (PDF has {total_pages} pages)")
                    continue
                pix = pdf_document[page_num].get_pixmap(matrix=mat)
                images.append(pix.tobytes("png"))
        finally:
            # Close the document on success and on every error path.
            pdf_document.close()

        logger.info(f"Converted {len(images)} PDF pages to images")
        return images

    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available") from e
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
"""Pydantic models and enums for the Vocab Worksheet API."""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Enums
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class WorksheetType(str, Enum):
    # Kinds of worksheet that can be requested for a vocabulary session.

    # English -> German translation
    EN_TO_DE = "en_to_de"
    # German -> English translation
    DE_TO_EN = "de_to_en"
    # Write word multiple times
    COPY_PRACTICE = "copy"
    # Fill in the blanks
    GAP_FILL = "gap_fill"
    # All types combined
    COMBINED = "combined"
|
||||||
|
|
||||||
|
|
||||||
|
class SessionStatus(str, Enum):
    # Lifecycle states of a vocab worksheet session.

    # Session created, no upload yet
    PENDING = "pending"
    # OCR in progress
    PROCESSING = "processing"
    # Vocabulary extracted, ready to edit
    EXTRACTED = "extracted"
    # Worksheet generated
    COMPLETED = "completed"
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Pydantic Models
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class VocabularyEntry(BaseModel):
    # One vocabulary item (English/German pair plus optional extras).
    id: str
    english: str
    german: str
    example_sentence: Optional[str] = None
    # Example sentence with ___ in place of the word (for gap-fill)
    example_sentence_gap: Optional[str] = None
    # noun, verb, adjective, etc.
    word_type: Optional[str] = None
    # Page number where entry was found (1-indexed)
    source_page: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
|
class SessionCreate(BaseModel):
    # Request body for creating a new vocab session.
    name: str
    description: Optional[str] = None
    # Source language (default English)
    source_language: str = "en"
    # Target language (default German)
    target_language: str = "de"
|
||||||
|
|
||||||
|
|
||||||
|
class SessionResponse(BaseModel):
    # Session summary returned by the API.
    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    # Plain string here, not the SessionStatus enum
    status: str
    vocabulary_count: int
    image_path: Optional[str]
    created_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
class VocabularyResponse(BaseModel):
    # Vocabulary list of a session together with the extraction confidence.
    session_id: str
    vocabulary: List[VocabularyEntry]
    extraction_confidence: Optional[float]
|
||||||
|
|
||||||
|
|
||||||
|
class VocabularyUpdate(BaseModel):
    # Request body carrying a list of vocabulary entries for an update.
    vocabulary: List[VocabularyEntry]
|
||||||
|
|
||||||
|
|
||||||
|
class WorksheetGenerateRequest(BaseModel):
    # Options for generating worksheets from a session's vocabulary.
    worksheet_types: List[WorksheetType]
    title: Optional[str] = None
    include_solutions: bool = True
    # For copy practice
    repetitions: int = 3
    # normal, large, extra-large
    line_height: str = "normal"
|
||||||
|
|
||||||
|
|
||||||
|
class WorksheetResponse(BaseModel):
    # Metadata describing a generated worksheet.
    id: str
    session_id: str
    # Plain strings here, not WorksheetType members
    worksheet_types: List[str]
    pdf_path: str
    solution_path: Optional[str]
    generated_at: datetime
|
||||||
@@ -0,0 +1,481 @@
|
|||||||
|
"""
|
||||||
|
Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.
|
||||||
|
|
||||||
|
Extracted from vocab_worksheet_api.py to keep file sizes manageable.
|
||||||
|
|
||||||
|
Pipeline steps:
|
||||||
|
orientation → deskew → dewarp → crop → scan-quality → enhance →
|
||||||
|
dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
|
||||||
|
vocab extraction → row merging
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import uuid
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Optional heavy dependencies (not available in every environment)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
try:
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
except ImportError:
|
||||||
|
cv2 = None # type: ignore[assignment]
|
||||||
|
np = None # type: ignore[assignment]
|
||||||
|
logger.warning("cv2 / numpy not available — OCR pipeline disabled")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from PIL import Image
|
||||||
|
except ImportError:
|
||||||
|
Image = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pytesseract
|
||||||
|
except ImportError:
|
||||||
|
pytesseract = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
# CV pipeline helpers
|
||||||
|
try:
|
||||||
|
from cv_vocab_pipeline import (
|
||||||
|
deskew_two_pass,
|
||||||
|
dewarp_image,
|
||||||
|
detect_and_fix_orientation,
|
||||||
|
_cells_to_vocab_entries,
|
||||||
|
_fix_phonetic_brackets,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
deskew_two_pass = None # type: ignore[assignment]
|
||||||
|
dewarp_image = None # type: ignore[assignment]
|
||||||
|
detect_and_fix_orientation = None # type: ignore[assignment]
|
||||||
|
_cells_to_vocab_entries = None # type: ignore[assignment]
|
||||||
|
_fix_phonetic_brackets = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from cv_cell_grid import (
|
||||||
|
_merge_wrapped_rows,
|
||||||
|
_merge_phonetic_continuation_rows,
|
||||||
|
_merge_continuation_rows,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
_merge_wrapped_rows = None # type: ignore[assignment]
|
||||||
|
_merge_phonetic_continuation_rows = None # type: ignore[assignment]
|
||||||
|
_merge_continuation_rows = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from cv_ocr_engines import ocr_region_rapid
|
||||||
|
except ImportError:
|
||||||
|
ocr_region_rapid = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from cv_vocab_types import PageRegion
|
||||||
|
except ImportError:
|
||||||
|
PageRegion = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from ocr_pipeline_ocr_merge import (
|
||||||
|
_split_paddle_multi_words,
|
||||||
|
_merge_paddle_tesseract,
|
||||||
|
_deduplicate_words,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
_split_paddle_multi_words = None # type: ignore[assignment]
|
||||||
|
_merge_paddle_tesseract = None # type: ignore[assignment]
|
||||||
|
_deduplicate_words = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from cv_words_first import build_grid_from_words
|
||||||
|
except ImportError:
|
||||||
|
build_grid_from_words = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from ocr_pipeline_session_store import (
|
||||||
|
create_session_db as create_pipeline_session_db,
|
||||||
|
update_session_db as update_pipeline_session_db,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
create_pipeline_session_db = None # type: ignore[assignment]
|
||||||
|
update_pipeline_session_db = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main pipeline function
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _run_ocr_pipeline_for_page(
    img_bgr: "np.ndarray",
    page_number: int,
    vocab_session_id: str,
    *,
    ipa_mode: str = "none",
    syllable_mode: str = "none",
    enable_enhance: bool = True,
    max_columns: Optional[int] = 3,
    override_min_conf: Optional[int] = None,
) -> tuple:
    """Run the full Kombi OCR pipeline on a single page and return vocab entries.

    Uses the same pipeline as the admin OCR Kombi pipeline:
    orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
    (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)

    Failures while persisting to the pipeline session DB, cropping, scoring
    scan quality, enhancing, running RapidOCR, grid-building or merging rows
    are logged and the pipeline continues. Orientation, deskew, dewarp,
    Tesseract and the initial grid build are not guarded and propagate.

    Args:
        img_bgr: BGR numpy array.
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID; its first 8 characters are used
            in the pipeline session name.
        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
            Passed through to the grid build.
        syllable_mode: "none" (default for worksheets), "auto", "all", "en",
            "de". Passed through to the grid build.
        enable_enhance: If True, apply image enhancement when the scan-quality
            report flags the page as degraded.
        max_columns: Passed to ``build_grid_from_words``.
        override_min_conf: If truthy, used as the minimum Tesseract word
            confidence instead of the scan-quality recommendation (or 40 when
            no report is available).

    Returns:
        A 3-tuple ``(entries, rotation_deg, scan_quality_report)``: entries is
        a list of dicts with keys id/english/german/example_sentence/
        source_page; rotation_deg is the orientation correction reported by
        ``detect_and_fix_orientation``; scan_quality_report is the result of
        ``score_scan_quality`` or None if scoring failed.
    """
    import time as _time

    t_total = _time.time()
    img_h, img_w = img_bgr.shape[:2]
    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")

    # 1. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
        img_h, img_w = img_bgr.shape[:2]
        logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
    else:
        logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")

    # 2. Create pipeline session in DB (visible in admin Kombi UI)
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
        original_png = png_buf.tobytes()
        await create_pipeline_session_db(
            pipeline_session_id,
            name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
            filename=f"page_{page_number + 1}.png",
            original_png=original_png,
        )
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")

    # 3. Deskew (deskew_two_pass; its debug payload is not used here)
    t0 = _time.time()
    deskewed_bgr, angle_applied, _deskew_debug = deskew_two_pass(img_bgr.copy())
    logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")

    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")

    # 5. Content crop (removes scanner borders, gutter shadows)
    t0 = _time.time()
    try:
        from page_crop import detect_and_crop_page
        cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
        if crop_result.get("crop_applied"):
            dewarped_bgr = cropped_bgr
            logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
        else:
            logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
    except Exception as e:
        logger.warning(f" crop: failed ({e}), continuing with uncropped image")

    # 5b. Scan quality assessment
    scan_quality_report = None
    try:
        from scan_quality import score_scan_quality
        scan_quality_report = score_scan_quality(dewarped_bgr)
    except Exception as e:
        logger.warning(f" scan quality: failed ({e})")

    # NOTE(review): truthiness check means override_min_conf=0 is treated as
    # "no override" — kept as-is; confirm against callers before changing.
    if override_min_conf:
        min_ocr_conf = override_min_conf
    else:
        min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40

    # 5c. Image enhancement for degraded scans
    is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
    if is_degraded and enable_enhance:
        try:
            from ocr_image_enhance import enhance_for_ocr
            dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
            logger.info(" enhancement: applied (degraded scan)")
        except Exception as e:
            logger.warning(f" enhancement: failed ({e})")

    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
    t0 = _time.time()
    img_h, img_w = dewarped_bgr.shape[:2]

    # RapidOCR (local ONNX) — best effort, Tesseract still runs on failure
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
    except Exception as e:
        logger.warning(f" RapidOCR failed: {e}")
        rapid_words = []

    # Tesseract
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu", config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i, raw_text in enumerate(data["text"]):
        text = str(raw_text).strip()
        conf_raw = str(data["conf"][i])
        # Anything that is not a plain (possibly negative) integer counts as -1
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < min_ocr_conf:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i],
            "conf": conf,
        })

    # Merge dual-engine results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    from cv_words_first import build_grid_from_words

    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words  # fallback to Tesseract only

    # Build initial grid from merged words
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"

    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
                f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")

    # 7. Save word_result to pipeline session (needed by _build_grid_core)
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": 0,
        "ocr_engine": "rapid_kombi",
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
        },
    }

    # Save images + word_result to pipeline session for admin visibility
    try:
        _, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        _, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        # The cropped image is the same array as dewarped_bgr at this point,
        # so one encoding serves both fields.
        dewarped_png = dwp_buf.tobytes()
        await update_pipeline_session_db(
            pipeline_session_id,
            deskewed_png=dsk_buf.tobytes(),
            dewarped_png=dewarped_png,
            cropped_png=dewarped_png,
            word_result=word_result,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
            current_step=8,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")

    # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
    t0 = _time.time()
    try:
        from grid_editor_api import _build_grid_core
        session_data = {
            "word_result": word_result,
        }
        grid_result = await _build_grid_core(
            pipeline_session_id, session_data,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
        )
        logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
                    f"({_time.time() - t0:.1f}s)")

        # Save grid result to pipeline session (best effort, but no longer silent)
        try:
            await update_pipeline_session_db(
                pipeline_session_id,
                grid_editor_result=grid_result,
                current_step=11,
            )
        except Exception as e:
            logger.warning(f"Could not save grid result to pipeline session: {e}")

    except Exception as e:
        logger.warning(f" grid-build failed: {e}, falling back to basic grid")
        grid_result = None

    # 9. Extract vocab entries
    # Prefer grid-build result (better column detection, more cells) over
    # the initial build_grid_from_words() which often under-clusters.
    page_vocabulary = []
    extraction_source = "none"

    # A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
    if grid_result and grid_result.get("zones"):
        for zone in grid_result["zones"]:
            zone_cols = zone.get("columns", [])
            zone_cells = zone.get("cells", [])
            if not zone_cols or not zone_cells:
                continue

            # Sort columns by x position to determine roles
            sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))

            # Skip zones with only 1 column (likely headers/boxes)
            if len(sorted_cols) < 2:
                continue

            # Column indices in left-to-right order
            col_indices = [col.get("col_index", col.get("index", -1)) for col in sorted_cols]

            # Group cells by row
            rows_map: dict = {}
            for cell in zone_cells:
                ri = cell.get("row_index", 0)
                ci = cell.get("col_index", 0)
                rows_map.setdefault(ri, {})[ci] = (cell.get("text") or "").strip()

            # Heuristic: if first column is very narrow (<15% of zone width)
            # and there are at least 3 columns, it's likely a marker/ref
            # column — skip it for vocab. Depends only on the zone's columns,
            # so it is computed once per zone rather than once per row.
            first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
            zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
            skip_first = first_col_width / zone_width < 0.15 and len(sorted_cols) >= 3

            for ri in sorted(rows_map.keys()):
                row = rows_map[ri]
                # Collect texts in column-position order
                texts = [row.get(ci, "") for ci in col_indices]
                if not any(texts):
                    continue

                data_texts = texts[1:] if skip_first else texts

                entry = {
                    "id": str(uuid.uuid4()),
                    "english": data_texts[0] if len(data_texts) > 0 else "",
                    "german": data_texts[1] if len(data_texts) > 1 else "",
                    "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
                    "source_page": page_number + 1,
                }
                if entry["english"] or entry["german"]:
                    page_vocabulary.append(entry)

        if page_vocabulary:
            extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"

    # B) Fallback: original cells with column classification
    if not page_vocabulary:
        col_types = {c.get("type") for c in columns_meta}
        is_vocab = bool(col_types & {"column_en", "column_de"})

        if is_vocab:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation="british")
            for entry in entries:
                if not entry.get("english") and not entry.get("german"):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": entry.get("english", ""),
                    "german": entry.get("german", ""),
                    "example_sentence": entry.get("example", ""),
                    "source_page": page_number + 1,
                })
            extraction_source = f"classified ({len(columns_meta)} cols)"
        else:
            # Last resort: all cells by position
            rows_map2: dict = {}
            for cell in cells:
                ri = cell.get("row_index", 0)
                ci = cell.get("col_index", 0)
                rows_map2.setdefault(ri, {})[ci] = (cell.get("text") or "").strip()
            all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
            for ri in sorted(rows_map2.keys()):
                row = rows_map2[ri]
                texts = [row.get(ci, "") for ci in all_ci]
                if not any(texts):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": texts[0] if len(texts) > 0 else "",
                    "german": texts[1] if len(texts) > 1 else "",
                    "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
                    "source_page": page_number + 1,
                })
            extraction_source = f"generic ({len(all_ci)} cols)"

    # --- Post-processing: merge cell-wrap continuation rows ---
    if len(page_vocabulary) >= 2:
        try:
            # Convert to internal format (example_sentence → example)
            internal = [
                {
                    'row_index': idx,
                    'english': v.get('english', ''),
                    'german': v.get('german', ''),
                    'example': v.get('example_sentence', ''),
                }
                for idx, v in enumerate(page_vocabulary)
            ]

            n_before = len(internal)
            internal = _merge_wrapped_rows(internal)
            internal = _merge_phonetic_continuation_rows(internal)
            internal = _merge_continuation_rows(internal)

            if len(internal) < n_before:
                # Rebuild page_vocabulary from merged entries (fresh ids)
                merged_vocab = []
                for entry in internal:
                    if not entry.get('english') and not entry.get('german'):
                        continue
                    merged_vocab.append({
                        'id': str(uuid.uuid4()),
                        'english': entry.get('english', ''),
                        'german': entry.get('german', ''),
                        'example_sentence': entry.get('example', ''),
                        'source_page': page_number + 1,
                    })
                logger.info(f" row merging: {n_before} → {len(merged_vocab)} entries")
                page_vocabulary = merged_vocab
        except Exception as e:
            logger.warning(f" row merging failed (non-critical): {e}")

    logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")

    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")

    return page_vocabulary, rotation, scan_quality_report
|
||||||
@@ -0,0 +1,490 @@
|
|||||||
|
"""
|
||||||
|
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
|
||||||
|
|
||||||
|
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
|
||||||
|
|
||||||
|
Routes (no prefix — included into the main /api/v1/vocab router):
|
||||||
|
POST /sessions/{session_id}/upload-pdf-info
|
||||||
|
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
||||||
|
GET /sessions/{session_id}/pdf-page-image/{page_number}
|
||||||
|
POST /sessions/{session_id}/process-single-page/{page_number}
|
||||||
|
POST /sessions/{session_id}/process-pages
|
||||||
|
"""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
|
||||||
|
from .models import SessionStatus
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Local storage path
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Optional heavy dependencies
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
try:
|
||||||
|
import numpy as np
|
||||||
|
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
|
||||||
|
OCR_PIPELINE_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
np = None # type: ignore[assignment]
|
||||||
|
OCR_PIPELINE_AVAILABLE = False
|
||||||
|
logger.warning("OCR pipeline imports not available in upload module")
|
||||||
|
|
||||||
|
# Sub-module imports (already split out)
|
||||||
|
from .generation import (
|
||||||
|
convert_pdf_page_to_image,
|
||||||
|
convert_pdf_to_images,
|
||||||
|
get_pdf_page_count,
|
||||||
|
)
|
||||||
|
from .extraction import extract_vocabulary_from_image
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .ocr import _run_ocr_pipeline_for_page
|
||||||
|
except ImportError:
|
||||||
|
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
|
||||||
|
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# In-memory session store (shared with main module)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _get_sessions():
    """Return the ``_sessions`` store defined in the sibling ``api`` module.

    The import happens at call time rather than at module import time.
    """
    from .api import _sessions as session_store

    return session_store
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Router (no prefix — will be included into the main vocab router)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
upload_router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# POST /sessions/{session_id}/upload-pdf-info
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Store an uploaded PDF on the session and report its page count.

    The response also carries the per-page rotation found by orientation
    detection, so a client can preview pages before choosing which to process.
    """
    logger.info(f"PDF info request for session {session_id}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    # Accept the upload if either the extension or the content type says PDF
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''
    looks_like_pdf = extension == 'pdf' or content_type == 'application/pdf'
    if not looks_like_pdf:
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    content = await file.read()

    # Write the PDF to the session's storage directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(content)

    page_count = get_pdf_page_count(content)

    # Keep the raw bytes on the session for later processing
    session.update({
        "pdf_data": content,
        "pdf_path": pdf_path,
        "pdf_page_count": page_count,
        "status": "pdf_uploaded",
    })

    # Detect orientation per page so thumbnails are shown correctly;
    # only pages with a non-zero rotation are recorded.
    page_rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for page_index in range(page_count):
            try:
                rendered = render_pdf_high_res(content, page_index, zoom=2.0)
                _, rotation = detect_and_fix_orientation(rendered)
            except Exception as e:
                logger.warning(f"Orientation detection failed for page {page_index + 1}: {e}")
                continue
            if rotation:
                page_rotations[page_index] = rotation
                logger.info(f"Page {page_index + 1}: orientation {rotation}°")
    session["page_rotations"] = page_rotations

    return {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
        "page_rotations": page_rotations,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        session_id: Key of the session in the in-memory session store.
        page_number: 0-indexed page to render.
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).

    Returns:
        StreamingResponse with the page rendered as PNG.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if no PDF was
            uploaded or the page number is out of range; 500 if rendering
            fails.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Reject bad page numbers up front (as get_pdf_page_image does) instead of
    # letting a negative index wrap around or an out-of-range index surface as
    # a 500. The upper bound is only enforced when the page count is known.
    page_count = session.get("pdf_page_count")
    if page_number < 0 or (page_count is not None and page_number >= page_count):
        known = f" PDF has {page_count} pages (0-indexed)." if page_count is not None else ""
        raise HTTPException(status_code=400, detail=f"Invalid page number.{known}")

    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            # Apply orientation correction detected during OCR processing
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Close the document even when rendering raises
            pdf_document.close()
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# GET /sessions/{session_id}/pdf-page-image/{page_number}
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        session_id: Key of the session in the in-memory session mapping.
        page_number: 0-indexed page to render.
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).

    Returns:
        StreamingResponse carrying the rendered page as ``image/png``.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if the session has
            no PDF or the page number is out of range, 500 if rendering fails.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            # Apply orientation correction detected during OCR processing
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            png_data = pix.tobytes("png")
        finally:
            # Always release the document, even when rendering raises.
            pdf_document.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# POST /sessions/{session_id}/process-single-page/{page_number}
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
    """
    Extract the vocabulary of ONE page of the session's uploaded PDF.

    When the OCR pipeline is available the page is rendered at zoom 3.0 and
    passed to ``_run_ocr_pipeline_for_page``; otherwise the page is converted
    to an image and handed to the LLM vision extraction as a fallback.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.

    Returns:
        Dict with the vocabulary of just this page. Extraction failures are
        reported as ``success: False`` in the payload rather than raised.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if the session has
            no PDF or the page number is out of range.
    """
    # Logs, responses and stored entries use 1-based page numbers.
    display_page = page_number + 1
    logger.info(f"Processing SINGLE page {display_page} for session {session_id}")

    def _failure(reason):
        # Error payload shared by the OCR and the LLM-vision failure paths.
        return {
            "session_id": session_id,
            "page_number": display_page,
            "success": False,
            "error": reason,
            "vocabulary": [],
            "vocabulary_count": 0,
        }

    if session_id not in _get_sessions():
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not 0 <= page_number < page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Translate the "0 means unset" query values into None for the pipeline.
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None

    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; replaced by the quality report's recommendation when present

    pipeline_ready = OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None
    if pipeline_ready:
        # --- OCR pipeline path (same render_pdf_high_res as the admin OCR pipeline) ---
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
                enable_enhance=enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            logger.error(f"OCR pipeline failed for page {display_page}: {e}", exc_info=True)
            return _failure(f"OCR pipeline error: {e}")
    else:
        # --- Fallback: LLM vision extraction ---
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{display_page}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {display_page} failed: {error}")
            return _failure(error)

        page_vocabulary = []
        for entry in vocabulary:
            # Accept objects exposing .dict(), plain objects, or mappings.
            if hasattr(entry, 'dict'):
                entry_dict = entry.dict()
            elif hasattr(entry, '__dict__'):
                entry_dict = entry.__dict__.copy()
            else:
                entry_dict = dict(entry)
            entry_dict['source_page'] = display_page
            if not entry_dict.get('id'):
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {display_page}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Remember the rotation so the image/thumbnail endpoints can apply it.
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Replace this page's earlier entries (re-processing) and keep all others.
    merged_vocab = [
        item for item in session.get("vocabulary", [])
        if item.get("source_page") != display_page
    ]
    merged_vocab.extend(page_vocabulary)
    session["vocabulary"] = merged_vocab
    session["vocabulary_count"] = len(merged_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "page_number": display_page,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(merged_vocab),
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }

    # Attach the scan quality report together with the settings that were active.
    if quality_report:
        if override_min_conf:
            confidence_step = f"min_conf={override_min_conf} (override)"
        else:
            confidence_step = f"min_conf={min_ocr_conf}"
        columns_step = f"max_cols={max_columns}" if max_columns else "unlimited"
        enhance_step = "on" if enhance and quality_report.is_degraded else "off"

        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": confidence_step,
            "step2_max_columns": columns_step,
            "step3_enhance": enhance_step,
        }
        result["scan_quality"] = sq

    return result
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# POST /sessions/{session_id}/process-pages (DEPRECATED)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: List[int] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        session_id: Key of the session in the in-memory session mapping.
        pages: List of 0-indexed page numbers to process (defaults to the first page).
        process_all: If True, process all pages and ignore ``pages``.

    Returns:
        Dict summarising processed/successful/failed pages, the vocabulary
        count and the average confidence over the successful pages. An
        ``errors`` list is included when at least one page failed.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if it has no PDF.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif not pages:
        pages = [0]  # Default to first page

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    # Pair every rendered image with the page number it was requested for.
    for page_num, image_data in zip(pages, images):
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
            continue

        successful_pages.append(page_num + 1)
        total_confidence += confidence

        # Add page info to each entry and convert to dict
        for entry in vocabulary:
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_num + 1
            all_vocabulary.append(entry_dict)

        logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session (this endpoint replaces the session vocabulary wholesale)
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        # The directory may not exist yet; create it so the write cannot
        # fail with FileNotFoundError.
        os.makedirs(session_dir, exist_ok=True)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result
|
||||||
@@ -1,196 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to vocab/learn_bridge.py
|
||||||
Vocab Learn Bridge — Converts vocabulary session data into Learning Units.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Bridges klausur-service (vocab extraction) with backend-lehrer (learning units + generators).
|
_sys.modules[__name__] = _importlib.import_module("vocab.learn_bridge")
|
||||||
Creates a Learning Unit in backend-lehrer, then triggers MC/Cloze/QA generation.
|
|
||||||
|
|
||||||
DATENSCHUTZ: All communication stays within Docker network (breakpilot-network).
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import httpx
|
|
||||||
from typing import List, Dict, Any, Optional
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
BACKEND_LEHRER_URL = os.getenv("BACKEND_LEHRER_URL", "http://backend-lehrer:8001")
|
|
||||||
|
|
||||||
|
|
||||||
def vocab_to_analysis_data(session_name: str, vocabulary: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Convert vocabulary entries from a vocab session into the analysis_data format
    expected by backend-lehrer generators (MC, Cloze, QA).

    The returned dict contains:
    - title: Display name (``session_name``)
    - subject: Fixed to "English Vocabulary"
    - grade_level: Fixed to "5-8"
    - canonical_text: One "english = german (example)" line per entry
    - printed_blocks: One {"text": "english — german | example"} block per entry
    - vocabulary: The original ``vocabulary`` list, passed through unchanged

    Entries with neither an English nor a German term are skipped. Fields
    that are missing or explicitly ``None`` are treated as empty strings.
    """
    canonical_lines = []
    printed_blocks = []

    for v in vocabulary:
        # `or ""` also covers keys that exist with a None value;
        # .get(key, "") alone would hand None to .strip() and crash.
        en = (v.get("english") or "").strip()
        de = (v.get("german") or "").strip()
        example = (v.get("example_sentence") or "").strip()

        if not en and not de:
            continue

        line = f"{en} = {de}"
        block_text = f"{en} — {de}"
        if example:
            line += f" ({example})"
            block_text += f" | {example}"

        canonical_lines.append(line)
        printed_blocks.append({"text": block_text})

    return {
        "title": session_name,
        "subject": "English Vocabulary",
        "grade_level": "5-8",
        "canonical_text": "\n".join(canonical_lines),
        "printed_blocks": printed_blocks,
        "vocabulary": vocabulary,
    }
|
|
||||||
|
|
||||||
|
|
||||||
async def create_learning_unit(
    session_name: str,
    vocabulary: List[Dict[str, Any]],
    grade: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Register a Learning Unit in backend-lehrer for the given vocabulary.

    Steps:
    1. POST the unit to /api/learning-units/
    2. Write the analysis data to ``<unit_id>_analyse.json`` under
       ~/Arbeitsblaetter/Lerneinheiten

    Returns:
        Dict with unit_id, unit (the backend response), analysis_path,
        vocabulary_count and status.

    Raises:
        ValueError: If ``vocabulary`` is empty.
        RuntimeError: If the backend request fails or the response has no id.
    """
    if not vocabulary:
        raise ValueError("No vocabulary entries provided")

    analysis_data = vocab_to_analysis_data(session_name, vocabulary)
    create_payload = {
        "title": session_name,
        "subject": "Englisch",
        "grade": grade or "5-8",
    }
    endpoint = f"{BACKEND_LEHRER_URL}/api/learning-units/"

    async with httpx.AsyncClient(timeout=30.0) as client:
        # Step 1: create the unit remotely.
        try:
            response = await client.post(endpoint, json=create_payload)
            response.raise_for_status()
            unit = response.json()
        except httpx.HTTPError as e:
            logger.error(f"Failed to create learning unit: {e}")
            raise RuntimeError(f"Backend-Lehrer nicht erreichbar: {e}")

        unit_id = unit.get("id")
        if not unit_id:
            raise RuntimeError("Learning Unit created but no ID returned")

        logger.info(f"Created learning unit {unit_id} with {len(vocabulary)} vocabulary entries")

        # Step 2: persist the analysis data as JSON, keyed by the unit id.
        analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
        os.makedirs(analysis_dir, exist_ok=True)
        analysis_path = os.path.join(analysis_dir, f"{unit_id}_analyse.json")

        with open(analysis_path, "w", encoding="utf-8") as out_file:
            json.dump(analysis_data, out_file, ensure_ascii=False, indent=2)

        logger.info(f"Saved analysis data to {analysis_path}")

        summary = {
            "unit_id": unit_id,
            "unit": unit,
            "analysis_path": analysis_path,
            "vocabulary_count": len(vocabulary),
            "status": "created",
        }
        return summary
|
|
||||||
|
|
||||||
|
|
||||||
async def generate_learning_modules(
    unit_id: str,
    analysis_path: str,
) -> Dict[str, Any]:
    """
    Trigger QA, MC and Cloze generation for a learning unit.

    Loads the analysis data from ``analysis_path`` and posts it to the
    backend-lehrer ``generate-qa``, ``generate-mc`` and ``generate-cloze``
    endpoints, in that order. A failure of one generator does not stop the
    others; each outcome is recorded in the result.

    Args:
        unit_id: ID of the learning unit in backend-lehrer.
        analysis_path: Path of the JSON file holding the analysis data.

    Returns:
        Dict with ``unit_id`` and one entry per module (``mc``, ``cloze``,
        ``qa``), each carrying a ``status`` of "generated" (plus ``data``),
        "skipped" (non-200 response) or "error" (exception), with ``reason``
        set in the latter two cases.

    Raises:
        OSError / json.JSONDecodeError: If the analysis file cannot be read.
    """
    results = {
        "unit_id": unit_id,
        "mc": {"status": "pending"},
        "cloze": {"status": "pending"},
        "qa": {"status": "pending"},
    }

    # Load analysis data
    with open(analysis_path, "r", encoding="utf-8") as f:
        analysis_data = json.load(f)

    # (result key / endpoint suffix, log label, cap on num_questions or None)
    jobs = (
        ("qa", "QA", 20),  # QA includes Leitner fields
        ("mc", "MC", 10),
        ("cloze", "Cloze", None),
    )

    async with httpx.AsyncClient(timeout=120.0) as client:
        for key, label, question_cap in jobs:
            try:
                payload = {"analysis_data": analysis_data}
                if question_cap is not None:
                    # Never ask for more questions than there are vocabulary entries.
                    payload["num_questions"] = min(len(analysis_data.get("vocabulary", [])), question_cap)

                resp = await client.post(
                    f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/generate-{key}",
                    json=payload,
                )
                if resp.status_code == 200:
                    results[key] = {"status": "generated", "data": resp.json()}
                else:
                    logger.warning(f"{label} generation returned {resp.status_code}")
                    results[key] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"}
            except Exception as e:
                # Best effort: record the failure and continue with the next generator.
                logger.warning(f"{label} generation failed: {e}")
                results[key] = {"status": "error", "reason": str(e)}

    return results
|
|
||||||
|
|||||||
@@ -1,428 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to vocab/session_store.py
|
||||||
Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Replaces in-memory storage with database persistence.
|
_sys.modules[__name__] = _importlib.import_module("vocab.session_store")
|
||||||
See migrations/001_vocab_sessions.sql for schema.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import uuid
|
|
||||||
import logging
|
|
||||||
import json
|
|
||||||
from typing import Optional, List, Dict, Any
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Database configuration
|
|
||||||
DATABASE_URL = os.getenv(
|
|
||||||
"DATABASE_URL",
|
|
||||||
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Connection pool (initialized lazily)
|
|
||||||
_pool: Optional[asyncpg.Pool] = None
|
|
||||||
|
|
||||||
|
|
||||||
async def get_pool() -> asyncpg.Pool:
    """Return the module-wide connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    # First call: build the pool from DATABASE_URL and cache it for later calls.
    _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool
|
|
||||||
|
|
||||||
|
|
||||||
async def init_vocab_tables():
    """
    Create the vocab tables when the ``vocab_sessions`` table is missing.

    The schema is read from ``migrations/001_vocab_sessions.sql`` next to this
    module. If that file does not exist, a warning is logged and nothing is
    created.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        already_present = await conn.fetchval("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = 'vocab_sessions'
            )
        """)

        if already_present:
            logger.debug("Vocab tables already exist")
            return

        logger.info("Creating vocab tables...")
        # The migration lives next to this module.
        migration_path = os.path.join(
            os.path.dirname(__file__),
            "migrations/001_vocab_sessions.sql"
        )
        if not os.path.exists(migration_path):
            logger.warning(f"Migration file not found: {migration_path}")
            return

        with open(migration_path, "r") as migration_file:
            sql = migration_file.read()
        await conn.execute(sql)
        logger.info("Vocab tables created successfully")
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# SESSION OPERATIONS
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
async def create_session_db(
    session_id: str,
    name: str,
    description: str = "",
    source_language: str = "en",
    target_language: str = "de"
) -> Dict[str, Any]:
    """Insert a new vocabulary session (status 'pending', count 0) and return it as a dict.

    Raises:
        ValueError: If ``session_id`` is not a valid UUID string.
    """
    insert_sql = """
        INSERT INTO vocab_sessions (
            id, name, description, source_language, target_language,
            status, vocabulary_count
        ) VALUES ($1, $2, $3, $4, $5, 'pending', 0)
        RETURNING *
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        created = await conn.fetchrow(
            insert_sql,
            uuid.UUID(session_id),
            name,
            description,
            source_language,
            target_language,
        )
        return _row_to_dict(created)
|
|
||||||
|
|
||||||
|
|
||||||
async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one session by its UUID string; returns None when no row matches."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        found = await conn.fetchrow("""
            SELECT * FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))

        return _row_to_dict(found) if found else None
|
|
||||||
|
|
||||||
|
|
||||||
async def list_sessions_db(
    limit: int = 50,
    offset: int = 0,
    status: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Return sessions ordered newest first, paginated, optionally filtered by status."""
    if status:
        query = """
            SELECT * FROM vocab_sessions
            WHERE status = $1
            ORDER BY created_at DESC
            LIMIT $2 OFFSET $3
        """
        params = (status, limit, offset)
    else:
        query = """
            SELECT * FROM vocab_sessions
            ORDER BY created_at DESC
            LIMIT $1 OFFSET $2
        """
        params = (limit, offset)

    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(query, *params)
        return [_row_to_dict(record) for record in records]
|
|
||||||
|
|
||||||
|
|
||||||
async def update_session_db(
    session_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """Update the whitelisted columns of a session and return the updated row.

    Keyword arguments naming a column outside the whitelist are ignored. When
    nothing remains to update, the current row is returned unchanged. Returns
    None when no session with this id exists.
    """
    pool = await get_pool()

    updatable = [
        'name', 'description', 'status', 'vocabulary_count',
        'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count',
        'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages'
    ]
    # Columns whose values are serialised to JSON text before binding.
    json_columns = ['ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages']

    assignments = []
    params = []
    for column, new_value in kwargs.items():
        if column not in updatable:
            continue
        if column in json_columns:
            # NOTE: falsy values (including empty lists/dicts) are stored as NULL.
            new_value = json.dumps(new_value) if new_value else None
        params.append(new_value)
        # The placeholder number equals the value's 1-based position in params.
        assignments.append(f"{column} = ${len(params)}")

    if not assignments:
        return await get_session_db(session_id)

    params.append(uuid.UUID(session_id))
    id_placeholder = len(params)

    async with pool.acquire() as conn:
        updated = await conn.fetchrow(f"""
            UPDATE vocab_sessions
            SET {', '.join(assignments)}
            WHERE id = ${id_placeholder}
            RETURNING *
        """, *params)

        return _row_to_dict(updated) if updated else None
|
|
||||||
|
|
||||||
|
|
||||||
async def delete_session_db(session_id: str) -> bool:
    """Delete a session; True when exactly one row was removed.

    Related rows are expected to go with it via cascading deletes
    (defined in the schema, not in this function).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status_tag = await conn.execute("""
            DELETE FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))
    # The command status string is compared against the single-row case.
    return status_tag == "DELETE 1"
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# VOCABULARY OPERATIONS
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
async def add_vocabulary_db(
    session_id: str,
    vocab_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Add vocabulary entries to a session and refresh its vocabulary_count.

    All inserts and the count update run in one transaction, so a failure
    part-way through leaves neither partial entries nor a stale count.

    Args:
        session_id: UUID string of the owning session.
        vocab_list: Entries to insert. Missing ``english``/``german`` default
            to ""; the other fields default to NULL.

    Returns:
        The inserted rows as dicts, in input order ([] for an empty list).

    Raises:
        ValueError: If ``session_id`` is not a valid UUID string.
    """
    if not vocab_list:
        return []

    pool = await get_pool()
    session_uuid = uuid.UUID(session_id)
    results = []

    async with pool.acquire() as conn:
        async with conn.transaction():
            for vocab in vocab_list:
                row = await conn.fetchrow("""
                    INSERT INTO vocab_entries (
                        id, session_id, english, german, example_sentence,
                        example_sentence_gap, word_type, source_page
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                    RETURNING *
                """,
                    uuid.uuid4(),
                    session_uuid,
                    vocab.get('english', ''),
                    vocab.get('german', ''),
                    vocab.get('example_sentence'),
                    vocab.get('example_sentence_gap'),
                    vocab.get('word_type'),
                    vocab.get('source_page')
                )
                results.append(_row_to_dict(row))

            # Update vocabulary count
            await conn.execute("""
                UPDATE vocab_sessions
                SET vocabulary_count = (
                    SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
                )
                WHERE id = $1
            """, session_uuid)

    return results
|
|
||||||
|
|
||||||
|
|
||||||
async def get_vocabulary_db(
    session_id: str,
    source_page: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Return a session's vocabulary entries, optionally restricted to one page.

    With ``source_page`` set, entries of that page are returned in creation
    order; otherwise all entries are ordered by page (NULL pages last) and
    then by creation time.
    """
    session_uuid_args = [uuid.UUID(session_id)] if False else None  # placeholder removed below
    del session_uuid_args

    pool = await get_pool()
    async with pool.acquire() as conn:
        if source_page is None:
            records = await conn.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1
                ORDER BY source_page NULLS LAST, created_at
            """, uuid.UUID(session_id))
        else:
            records = await conn.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
                ORDER BY created_at
            """, uuid.UUID(session_id), source_page)

        return [_row_to_dict(record) for record in records]
|
|
||||||
|
|
||||||
|
|
||||||
async def update_vocabulary_db(
    entry_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """Update the whitelisted columns of one vocabulary entry.

    Keyword arguments naming a column outside the whitelist are ignored.
    Returns the updated row as a dict, or None when there was nothing to
    update or no entry with this id exists.
    """
    pool = await get_pool()

    updatable = [
        'english', 'german', 'example_sentence', 'example_sentence_gap',
        'word_type', 'source_page'
    ]

    assignments = []
    params = []
    for column, new_value in kwargs.items():
        if column not in updatable:
            continue
        params.append(new_value)
        # The placeholder number equals the value's 1-based position in params.
        assignments.append(f"{column} = ${len(params)}")

    if not assignments:
        return None

    params.append(uuid.UUID(entry_id))
    id_placeholder = len(params)

    async with pool.acquire() as conn:
        updated = await conn.fetchrow(f"""
            UPDATE vocab_entries
            SET {', '.join(assignments)}
            WHERE id = ${id_placeholder}
            RETURNING *
        """, *params)

        return _row_to_dict(updated) if updated else None
|
|
||||||
|
|
||||||
|
|
||||||
async def clear_page_vocabulary_db(session_id: str, page: int) -> int:
    """Delete every vocabulary entry of one page and refresh the session count.

    The delete and the ``vocabulary_count`` recalculation run inside a single
    transaction, so a failure between the two statements can no longer leave
    the stored count out of sync with the remaining entries.

    Args:
        session_id: Session UUID in string form; parsed with ``uuid.UUID``.
        page: Value matched against the ``source_page`` column.

    Returns:
        The number of deleted rows, taken from the command status string
        returned by the DELETE; ``0`` when that status is empty.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)

        async with conn.transaction():
            status = await conn.execute("""
                DELETE FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
            """, session_uuid, page)

            # Recompute the count from the rows that are left.
            await conn.execute("""
                UPDATE vocab_sessions
                SET vocabulary_count = (
                    SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
                )
                WHERE id = $1
            """, session_uuid)

    # The last token of the status string (e.g. "DELETE 3") is the row count.
    return int(status.split()[-1]) if status else 0
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# WORKSHEET OPERATIONS
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
async def create_worksheet_db(
    session_id: str,
    worksheet_types: List[str],
    pdf_path: Optional[str] = None,
    solution_path: Optional[str] = None
) -> Dict[str, Any]:
    """Insert a new row into ``vocab_worksheets`` and return it.

    Args:
        session_id: Owning session UUID in string form.
        worksheet_types: Worksheet type names; stored JSON-encoded.
        pdf_path: Optional path of the generated worksheet PDF.
        solution_path: Optional path of the generated solution PDF.

    Returns:
        The inserted row (``RETURNING *``) converted by ``_row_to_dict``.
    """
    pool = await get_pool()
    # A fresh random UUID identifies the new worksheet.
    new_worksheet_id = uuid.uuid4()

    insert_sql = """
        INSERT INTO vocab_worksheets (
            id, session_id, worksheet_types, pdf_path, solution_path
        ) VALUES ($1, $2, $3, $4, $5)
        RETURNING *
    """

    async with pool.acquire() as conn:
        params = (
            new_worksheet_id,
            uuid.UUID(session_id),
            json.dumps(worksheet_types),
            pdf_path,
            solution_path,
        )
        inserted = await conn.fetchrow(insert_sql, *params)

    return _row_to_dict(inserted)
|
|
||||||
|
|
||||||
|
|
||||||
async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]:
    """Look up a single worksheet row by its ID.

    Args:
        worksheet_id: Worksheet UUID in string form.

    Returns:
        The row converted by ``_row_to_dict``, or ``None`` if nothing matched.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        found = await conn.fetchrow("""
            SELECT * FROM vocab_worksheets WHERE id = $1
        """, uuid.UUID(worksheet_id))

    return _row_to_dict(found) if found else None
|
|
||||||
|
|
||||||
|
|
||||||
async def delete_worksheets_for_session_db(session_id: str) -> int:
    """Remove every worksheet row that belongs to the given session.

    Args:
        session_id: Session UUID in string form.

    Returns:
        The number of deleted rows, read from the last token of the command
        status string; ``0`` when the status is empty.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute("""
            DELETE FROM vocab_worksheets WHERE session_id = $1
        """, uuid.UUID(session_id))

    if not status:
        return 0
    return int(status.split()[-1])
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# PDF CACHE OPERATIONS
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
# Simple in-memory cache for PDF data (temporary until served)
|
|
||||||
_pdf_cache: Dict[str, bytes] = {}


def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None:
    """Keep PDF bytes in memory under the worksheet ID, replacing any earlier entry."""
    _pdf_cache.update({worksheet_id: pdf_data})


def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]:
    """Return the PDF bytes cached for the worksheet ID, or ``None`` if absent."""
    try:
        return _pdf_cache[worksheet_id]
    except KeyError:
        return None


def clear_cached_pdf_data(worksheet_id: str) -> None:
    """Drop the cached PDF for the worksheet ID; a missing entry is not an error."""
    if worksheet_id in _pdf_cache:
        del _pdf_cache[worksheet_id]
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# HELPER FUNCTIONS
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
    """Turn a database record into a plain dict of JSON-friendly values.

    Conversions applied to columns that are present and not ``None``:
      * ``id`` and ``session_id`` are passed through ``str``.
      * ``created_at``, ``updated_at`` and ``generated_at`` become
        ``.isoformat()`` strings.
      * ``ocr_prompts``, ``processed_pages``, ``successful_pages``,
        ``failed_pages`` and ``worksheet_types`` are decoded with
        ``json.loads`` when they hold a string.

    Args:
        row: The record to convert, or ``None``.

    Returns:
        A new dict with the converted values; an empty dict for ``None``.
    """
    if row is None:
        return {}

    converted = dict(row)

    def _has_value(column):
        return converted.get(column) is not None

    # UUID columns
    for column in ('id', 'session_id'):
        if _has_value(column):
            converted[column] = str(converted[column])

    # Timestamp columns
    for column in ('created_at', 'updated_at', 'generated_at'):
        if _has_value(column):
            converted[column] = converted[column].isoformat()

    # JSONB columns: only values still held as text need decoding
    for column in ('ocr_prompts', 'processed_pages', 'successful_pages',
                   'failed_pages', 'worksheet_types'):
        raw = converted.get(column)
        if isinstance(raw, str):
            converted[column] = json.loads(raw)

    return converted
|
|
||||||
|
|||||||
@@ -1,472 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to vocab/worksheet/analysis_api.py
|
||||||
Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
|
import importlib as _importlib
|
||||||
extract-with-boxes, deskewed images, and learning unit generation.
|
import sys as _sys
|
||||||
|
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.analysis_api")
|
||||||
The two large handlers (compare_ocr_methods, analyze_grid) live in
|
|
||||||
vocab_worksheet_compare_api.py and are included via compare_router.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Body, HTTPException
|
|
||||||
from fastapi.responses import StreamingResponse
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from typing import Optional, Dict, Any
|
|
||||||
from datetime import datetime
|
|
||||||
import os
|
|
||||||
import io
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
|
|
||||||
def _get_sessions():
    """Return the ``_sessions`` object defined in ``vocab_worksheet_api``.

    The import runs on each call rather than at module load time
    (presumably to avoid a circular import with that module -- confirm).
    """
    from vocab_worksheet_api import _sessions as session_registry
    return session_registry
|
|
||||||
|
|
||||||
def _get_local_storage_path():
    """Return ``LOCAL_STORAGE_PATH`` as defined in ``vocab_worksheet_api``.

    The import runs on each call rather than at module load time
    (presumably to avoid a circular import with that module -- confirm).
    """
    from vocab_worksheet_api import LOCAL_STORAGE_PATH as storage_root
    return storage_root
|
|
||||||
from vocab_worksheet_generation import convert_pdf_page_to_image
|
|
||||||
|
|
||||||
# Try to import Tesseract extractor
|
|
||||||
try:
|
|
||||||
from tesseract_vocab_extractor import (
|
|
||||||
extract_bounding_boxes, TESSERACT_AVAILABLE,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
TESSERACT_AVAILABLE = False
|
|
||||||
|
|
||||||
# Try to import Grid Detection Service
|
|
||||||
try:
|
|
||||||
from services.grid_detection_service import GridDetectionService
|
|
||||||
GRID_SERVICE_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
GRID_SERVICE_AVAILABLE = False
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
analysis_router = APIRouter()
|
|
||||||
|
|
||||||
def _ocr_export_dir():
    """Return the ``ocr-exports`` directory path below the local storage root."""
    storage_root = _get_local_storage_path()
    return os.path.join(storage_root, "ocr-exports")
|
|
||||||
|
|
||||||
def _ground_truth_dir():
    """Return the ``ground-truth`` directory path below the local storage root."""
    storage_root = _get_local_storage_path()
    return os.path.join(storage_root, "ground-truth")
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# OCR Export Endpoints (for cross-app OCR data sharing)
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Store an OCR export for cross-app sharing (admin-v2 -> studio-v2).

    Both apps reach klausur-service through /klausur-api/, so the files
    written here act as storage both of them can read.

    The request body is written to ``<session_id>_page<page_number>.json`` in
    the OCR export directory, and ``latest.json`` is rewritten to point at
    this export together with a UTC timestamp.
    """
    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")

    export_dir = _ocr_export_dir()
    os.makedirs(export_dir, exist_ok=True)

    def _dump_json(path, payload):
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    # The export itself
    _dump_json(
        os.path.join(export_dir, f"{session_id}_page{page_number}.json"),
        data,
    )

    # Pointer to the most recent export
    _dump_json(
        os.path.join(export_dir, "latest.json"),
        {
            "session_id": session_id,
            "page_number": page_number,
            "saved_at": datetime.utcnow().isoformat(),
        },
    )

    response = {"success": True}
    response["session_id"] = session_id
    response["page_number"] = page_number
    response["message"] = "OCR export saved successfully"
    return response
|
|
||||||
|
|
||||||
|
|
||||||
@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Return the stored OCR export for one session page.

    Raises:
        HTTPException: 404 when no export file exists for that page.
    """
    filename = f"{session_id}_page{page_number}.json"
    export_path = os.path.join(_ocr_export_dir(), filename)

    if os.path.exists(export_path):
        with open(export_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    raise HTTPException(status_code=404, detail="OCR export not found")
|
|
||||||
|
|
||||||
|
|
||||||
@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Return the OCR export that ``latest.json`` points to.

    Raises:
        HTTPException: 404 when ``latest.json`` is missing, or when the
            export file it names does not exist.
    """
    export_dir = _ocr_export_dir()

    def _read_json(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    pointer_path = os.path.join(export_dir, "latest.json")
    if not os.path.exists(pointer_path):
        raise HTTPException(status_code=404, detail="No OCR exports found")

    pointer = _read_json(pointer_path)
    session_id = pointer.get("session_id")
    page_number = pointer.get("page_number")

    # Resolve the pointer to the export file it references.
    export_path = os.path.join(export_dir, f"{session_id}_page{page_number}.json")
    if not os.path.exists(export_path):
        raise HTTPException(status_code=404, detail="Latest OCR export file not found")

    return _read_json(export_path)
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Extract with Boxes & Deskewed Image
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Extract vocabulary rows and their bounding boxes from a page image.

    Runs ``extract_bounding_boxes`` on the image, converts the detected words
    to regions with ``GridDetectionService`` and lets it detect a grid. Each
    grid row that has text in an English, German or example cell becomes one
    entry.

    Args:
        image_bytes: Encoded page image.
        lang: Language string forwarded to ``extract_bounding_boxes``.

    Returns:
        Dict with ``entries``, ``image_width`` and ``image_height``. Each
        entry has ``row_index``, ``english``, ``german``, ``example``,
        ``confidence``, ``bbox``, ``bbox_en``, ``bbox_de`` and ``bbox_ex``.
        Bounding-box coordinates are in percent (0-100).

    Raises:
        HTTPException: 500 when Tesseract or GridDetectionService is not
            available.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Step 1: word-level bounding boxes
    ocr_output = await extract_bounding_boxes(image_bytes, lang=lang)
    words = ocr_output.get("words", [])
    img_w = ocr_output.get("image_width", 0)
    img_h = ocr_output.get("image_height", 0)

    def _payload(found_entries):
        return {"entries": found_entries, "image_width": img_w, "image_height": img_h}

    if not words or img_w == 0 or img_h == 0:
        return _payload([])

    # Step 2: percentage-based OCR regions
    service = GridDetectionService()
    regions = service.convert_tesseract_regions(words, img_w, img_h)
    if not regions:
        return _payload([])

    # Step 3: grid detection
    grid_result = service.detect_grid(regions)
    if not grid_result.cells:
        return _payload([])

    # Step 4: collect text and boxes per row, keyed by column type
    from services.grid_detection_service import ColumnType

    column_slots = (
        (ColumnType.ENGLISH, "en"),
        (ColumnType.GERMAN, "de"),
        (ColumnType.EXAMPLE, "ex"),
    )

    entries = []
    for row_idx, row_cells in enumerate(grid_result.cells):
        texts = {"en": "", "de": "", "ex": ""}
        boxes = {"en": None, "de": None, "ex": None}
        conf_total = 0.0
        conf_cells = 0

        for cell in row_cells:
            box = {
                "x": round(cell.x, 2),
                "y": round(cell.y, 2),
                "w": round(cell.width, 2),
                "h": round(cell.height, 2),
            }
            stripped = cell.text.strip()

            # A later cell of the same column type overwrites an earlier one.
            for column_type, slot in column_slots:
                if cell.column_type == column_type:
                    texts[slot] = stripped
                    boxes[slot] = box
                    break

            # Confidence is averaged over every non-empty cell in the row,
            # whatever its column type.
            if stripped:
                conf_total += cell.confidence
                conf_cells += 1

        # Rows without any English/German/example text are dropped.
        if not (texts["en"] or texts["de"] or texts["ex"]):
            continue

        # Whole-row box: the union of the per-column boxes.
        present = [b for b in boxes.values() if b is not None]
        if present:
            left = min(b["x"] for b in present)
            top = min(b["y"] for b in present)
            right = max(b["x"] + b["w"] for b in present)
            bottom = max(b["y"] + b["h"] for b in present)
            row_bbox = {
                "x": round(left, 2),
                "y": round(top, 2),
                "w": round(right - left, 2),
                "h": round(bottom - top, 2),
            }
        else:
            row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}

        avg_conf = round((conf_total / conf_cells * 100) if conf_cells > 0 else 0, 1)

        entries.append({
            "row_index": row_idx,
            "english": texts["en"],
            "german": texts["de"],
            "example": texts["ex"],
            "confidence": avg_conf,
            "bbox": row_bbox,
            "bbox_en": boxes["en"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": boxes["de"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": boxes["ex"] or {"x": 0, "y": 0, "w": 0, "h": 0},
        })

    return _payload(entries)
|
|
||||||
|
|
||||||
|
|
||||||
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Extract vocabulary entries with bounding boxes for ground truth labeling.

    Renders the requested PDF page (``page_number`` is 0-indexed), attempts to
    deskew it, runs ``extract_entries_with_boxes`` on the result and caches
    both the image and the entries in the session.

    Raises:
        HTTPException: 404 for an unknown session; 400 when the session has
            no PDF or the page number is out of range.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Full-resolution render of the page
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Deskewing is best effort: on any failure the unmodified render is used.
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as e:
        logger.warning(f"Deskew failed for page {page_number}: {e}")

    page_key = str(page_number)

    # Keep the (possibly deskewed) image so it can be served later.
    session.setdefault("deskewed_images", {})[page_key] = image_data

    # OCR runs on the same image that was just cached.
    result = await extract_entries_with_boxes(image_data)
    session.setdefault("gt_entries", {})[page_key] = result["entries"]

    found = result["entries"]
    return {
        "success": True,
        "entries": found,
        "entry_count": len(found),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }
|
|
||||||
|
|
||||||
|
|
||||||
@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Serve the deskewed page image as PNG.

    When no deskewed image is cached for the page, the original page is
    rendered at full resolution and served instead.

    Raises:
        HTTPException: 404 for an unknown session; 400 when the fallback is
            needed but the session has no PDF.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    png_bytes = session.get("deskewed_images", {}).get(str(page_number))
    if not png_bytes:
        # Nothing cached: fall back to rendering the original page.
        pdf_data = session.get("pdf_data")
        if not pdf_data:
            raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
        png_bytes = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    return StreamingResponse(io.BytesIO(png_bytes), media_type="image/png")
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Ground Truth Labeling
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Store ground truth labels for one page.

    The body must contain an ``entries`` list; each entry carries english,
    german, example, a status ('confirmed' | 'edited' | 'skipped') and bbox
    fields. The entries are kept in the session and also written to a JSON
    file in the ground truth directory.

    Raises:
        HTTPException: 404 for an unknown session; 400 when ``entries`` is
            missing or empty.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")

    # In-memory copy on the session
    session = sessions[session_id]
    session.setdefault("ground_truth", {})[str(page_number)] = entries

    # On-disk copy
    target_dir = _ground_truth_dir()
    os.makedirs(target_dir, exist_ok=True)
    gt_path = os.path.join(target_dir, f"{session_id}_page{page_number}.json")
    record = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as handle:
        json.dump(record, handle, ensure_ascii=False, indent=2)

    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    def _count_with_status(wanted):
        return sum(1 for entry in entries if entry.get("status") == wanted)

    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": _count_with_status("confirmed"),
        "edited": _count_with_status("edited"),
        "skipped": _count_with_status("skipped"),
        "file_path": gt_path,
    }
|
|
||||||
|
|
||||||
|
|
||||||
@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return the ground truth saved for one page.

    The session's in-memory copy is preferred (``source`` is ``"cache"``);
    otherwise the JSON file on disk is read (``source`` is ``"disk"``).

    Raises:
        HTTPException: 404 for an unknown session, or when neither the
            session nor the disk holds ground truth for the page.
    """
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # In-memory copy first
    in_memory = sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if in_memory:
        return {"success": True, "entries": in_memory, "source": "cache"}

    # Then the file written by the save endpoint
    gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    if not os.path.exists(gt_path):
        raise HTTPException(status_code=404, detail="No ground truth found for this page")

    with open(gt_path, 'r', encoding='utf-8') as handle:
        stored = json.load(handle)

    return {"success": True, "entries": stored.get("entries", []), "source": "disk"}
|
|
||||||
|
|
||||||
|
|
||||||
# ─── Learning Module Generation ─────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class GenerateLearningUnitRequest(BaseModel):
    # Request body for the generate-learning-unit endpoint.

    # Forwarded as ``grade`` when the learning unit is created; may be omitted.
    grade: Optional[str] = None

    # When true (the default), module generation is attempted after the unit
    # has been created.
    generate_modules: bool = True
|
|
||||||
|
|
||||||
|
|
||||||
@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(session_id: str, request: GenerateLearningUnitRequest = None):
    """
    Build a Learning Unit from the vocabulary of this session.

    1. Reads the vocabulary stored on the session
    2. Creates a Learning Unit via ``vocab_learn_bridge``
    3. Optionally triggers module generation for that unit

    A failure in step 3 does not fail the request: the unit info is still
    returned, with ``generation`` set to an error status.

    Returns the created unit info and generation status.

    Raises:
        HTTPException: 404 unknown session; 400 empty vocabulary or a
            ``ValueError`` from the bridge; 501 on ``ImportError``; 502 on
            ``RuntimeError``.
    """
    if request is None:
        request = GenerateLearningUnitRequest()

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    vocabulary = session.get("vocabulary", [])
    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary in this session")

    try:
        from vocab_learn_bridge import create_learning_unit, generate_learning_modules

        # Step 1: create the unit
        result = await create_learning_unit(
            session_name=session["name"],
            vocabulary=vocabulary,
            grade=request.grade,
        )

        if not request.generate_modules:
            return result

        # Step 2: module generation (best effort; the unit already exists)
        try:
            result["generation"] = await generate_learning_modules(
                unit_id=result["unit_id"],
                analysis_path=result["analysis_path"],
            )
        except Exception as e:
            logger.warning(f"Module generation failed (unit created): {e}")
            result["generation"] = {"status": "error", "reason": str(e)}

        return result

    except ImportError:
        raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        raise HTTPException(status_code=502, detail=str(e))
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Include compare_ocr_methods & analyze_grid from companion module
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
from vocab_worksheet_compare_api import compare_router # noqa: E402
|
|
||||||
|
|
||||||
analysis_router.include_router(compare_router)
|
|
||||||
|
|||||||
@@ -1,499 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to vocab/worksheet/api.py
|
||||||
Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
|
import importlib as _importlib
|
||||||
vocabulary editing, worksheet generation, and PDF downloads.
|
import sys as _sys
|
||||||
|
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.api")
|
||||||
Sub-routers (included at bottom):
|
|
||||||
- vocab_worksheet_upload_api: PDF upload, thumbnails, page processing
|
|
||||||
- vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth
|
|
||||||
"""
|
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Query
|
|
||||||
from fastapi.responses import StreamingResponse
|
|
||||||
from typing import List, Dict, Any
|
|
||||||
from datetime import datetime
|
|
||||||
import uuid
|
|
||||||
import os
|
|
||||||
import io
|
|
||||||
import logging
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# --- Imports from extracted sub-modules ---
|
|
||||||
from vocab_worksheet_models import (
|
|
||||||
WorksheetType,
|
|
||||||
SessionStatus,
|
|
||||||
VocabularyEntry,
|
|
||||||
SessionCreate,
|
|
||||||
SessionResponse,
|
|
||||||
VocabularyResponse,
|
|
||||||
VocabularyUpdate,
|
|
||||||
WorksheetGenerateRequest,
|
|
||||||
WorksheetResponse,
|
|
||||||
)
|
|
||||||
from vocab_worksheet_extraction import extract_vocabulary_from_image
|
|
||||||
from vocab_worksheet_generation import (
|
|
||||||
generate_worksheet_html, generate_worksheet_pdf,
|
|
||||||
convert_pdf_page_to_image,
|
|
||||||
)
|
|
||||||
|
|
||||||
# --- Database integration (used by main.py lifespan) ---
|
|
||||||
try:
|
|
||||||
from vocab_session_store import (
|
|
||||||
DATABASE_URL, get_pool, init_vocab_tables,
|
|
||||||
list_sessions_db, get_session_db,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
|
|
||||||
get_pool = None
|
|
||||||
init_vocab_tables = None
|
|
||||||
list_sessions_db = None
|
|
||||||
get_session_db = None
|
|
||||||
|
|
||||||
_db_pool = None
|
|
||||||
|
|
||||||
|
|
||||||
def set_db_pool(pool):
    """Remember the database connection pool at module level.

    Intended to be called from the lifespan handler in main.py.

    Args:
        pool: The pool object to store in ``_db_pool``.
    """
    global _db_pool
    _db_pool = pool
|
|
||||||
|
|
||||||
|
|
||||||
async def _init_vocab_table():
    """Run ``init_vocab_tables`` when it is available and log the outcome.

    A failure is logged as a warning and never propagated. When
    ``init_vocab_tables`` is unset, only the "ready" message is logged.
    """
    if not init_vocab_tables:
        logger.info("vocab_session_cache table ready")
        return

    try:
        await init_vocab_tables()
        logger.info("vocab_session_cache table ready")
    except Exception as e:
        logger.warning(f"Failed to init vocab tables: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
async def _load_all_sessions():
    """Copy up to 500 stored sessions into the in-memory ``_sessions`` cache.

    Sessions already present in ``_sessions`` are left untouched. Any
    exception is logged as a warning and not propagated. When
    ``list_sessions_db`` is unset, nothing is loaded.
    """
    if not list_sessions_db:
        logger.info("Loaded 0 vocab sessions from database")
        return

    try:
        stored_sessions = await list_sessions_db(limit=500)
        loaded = 0
        for s in stored_sessions:
            sid = s.get("id") or s.get("session_id")
            if not sid or sid in _sessions:
                continue
            # Missing fields fall back to defaults; created_at is kept as text.
            _sessions[sid] = {
                "id": sid,
                "name": s.get("name", ""),
                "description": s.get("description", ""),
                "status": s.get("status", "created"),
                "vocabulary_count": s.get("vocabulary_count", 0),
                "source_language": s.get("source_language", "en"),
                "target_language": s.get("target_language", "de"),
                "created_at": str(s.get("created_at", "")),
            }
            loaded += 1
        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as e:
        logger.warning(f"Failed to load sessions from database: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
# --- Router & module-level state ---
|
|
||||||
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
|
|
||||||
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
|
||||||
_sessions: Dict[str, Dict[str, Any]] = {}
|
|
||||||
_worksheets: Dict[str, Dict[str, Any]] = {}
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Create a new vocabulary extraction session.

    Registers the session in the in-memory ``_sessions`` store with status
    PENDING and an empty vocabulary, creates its directory below
    ``LOCAL_STORAGE_PATH`` and returns the session summary.
    """
    session_id = str(uuid.uuid4())
    pending = SessionStatus.PENDING.value
    created_at = datetime.utcnow()

    _sessions[session_id] = {
        "id": session_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": pending,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": created_at,
    }

    # Per-session storage directory
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, session_id), exist_ok=True)

    return SessionResponse(
        id=session_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        status=pending,
        vocabulary_count=0,
        image_path=None,
        created_at=created_at,
    )
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (1-100, default 50).

    Returns:
        Up to ``limit`` sessions from the in-memory store, ordered by
        ``created_at`` descending.
    """
    def _created_sort_key(entry):
        # Sessions made by create_session hold a datetime, while those
        # restored by _load_all_sessions hold a string. Comparing the two
        # directly raises TypeError, so both are reduced to text first.
        stamp = entry["created_at"]
        if isinstance(stamp, datetime):
            return stamp.isoformat()
        # str(datetime) separates date and time with a space, isoformat()
        # with "T"; unify the separator so both forms sort together.
        return str(stamp).replace(" ", "T", 1)

    newest_first = sorted(
        _sessions.values(),
        key=_created_sort_key,
        reverse=True
    )[:limit]

    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in newest_first
    ]
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return one session by id.

    Raises:
        HTTPException: 404 when ``session_id`` is not in the session store.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Optional fields fall back to None / 0 when the stored record lacks them.
    fields = {
        "id": record["id"],
        "name": record["name"],
        "description": record.get("description"),
        "source_language": record["source_language"],
        "target_language": record["target_language"],
        "status": record["status"],
        "vocabulary_count": record.get("vocabulary_count", 0),
        "image_path": record.get("image_path"),
        "created_at": record["created_at"],
    }
    return SessionResponse(**fields)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Store an uploaded textbook page and extract its vocabulary.

    Accepts PNG, JPG, JPEG and PDF uploads, recognised by file extension or
    by content type. A PDF is converted to an image of its first page before
    it is saved and analysed.

    Raises:
        HTTPException: 404 for an unknown session, 400 for an unsupported
            file type.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    session = _sessions.get(session_id)
    if session is None:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")

    # Classify the upload from both the filename suffix and the declared MIME type.
    extension = file.filename.rsplit('.', 1)[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    image_extensions = ('png', 'jpg', 'jpeg')
    image_content_types = ('image/png', 'image/jpeg', 'image/jpg')
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = extension in image_extensions or content_type in image_content_types

    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Pick the on-disk suffix: PDFs are stored as their PNG rendering; images keep
    # a recognised extension, otherwise the content type decides.
    if is_pdf or (extension not in image_extensions and content_type == 'image/png'):
        save_extension = 'png'
    elif extension in image_extensions:
        save_extension = extension
    else:
        save_extension = 'jpg'

    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Persist the (possibly converted) image under the session's storage directory.
    target_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(target_dir, exist_ok=True)
    image_path = os.path.join(target_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as image_file:
        image_file.write(content)

    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    # Run the extraction and record its outcome on the session.
    vocabulary, confidence, error = await extract_vocabulary_from_image(
        content,
        file.filename or "image.png",
        page_number=0,
    )
    entry_count = len(vocabulary)

    session["vocabulary"] = [entry.dict() for entry in vocabulary]
    session["vocabulary_count"] = entry_count
    session["extraction_confidence"] = confidence
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": entry_count,
        "extraction_confidence": confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    # The error key is only present when extraction reported one.
    if error:
        result["error"] = error
    return result
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary stored on a session together with its extraction confidence.

    Raises:
        HTTPException: 404 when the session does not exist.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Stored entries are plain dicts; rebuild the response models from them.
    entries = []
    for raw in record.get("vocabulary", []):
        entries.append(VocabularyEntry(**raw))

    return VocabularyResponse(
        session_id=session_id,
        vocabulary=entries,
        extraction_confidence=record.get("extraction_confidence"),
    )
|
|
||||||
|
|
||||||
|
|
||||||
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary with the submitted (manually corrected) entries.

    Raises:
        HTTPException: 404 when the session does not exist.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    new_count = len(update.vocabulary)
    record["vocabulary"] = [entry.dict() for entry in update.vocabulary]
    record["vocabulary_count"] = new_count

    response = {
        "session_id": session_id,
        "vocabulary_count": new_count,
        "message": "Vocabulary updated successfully",
    }
    return response
|
|
||||||
|
|
||||||
|
|
||||||
def _build_worksheet_html(vocabulary, request, title, show_solutions):
    """Render one HTML section per requested worksheet type, each followed by a page break."""
    title_suffix = " (Loesung)" if show_solutions else ""
    page_break = '<div style="page-break-after: always;"></div>'
    return "".join(
        generate_worksheet_html(
            vocabulary=vocabulary,
            worksheet_type=wtype,
            title=f"{title} - {wtype.value}{title_suffix}",
            show_solutions=show_solutions,
            repetitions=request.repetitions,
            line_height=request.line_height,
        ) + page_break
        for wtype in request.worksheet_types
    )


async def _render_pdf_or_500(html):
    """Convert HTML to PDF bytes, mapping any failure to an HTTP 500 response."""
    try:
        return await generate_worksheet_pdf(html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}") from e


@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate the worksheet PDF (and optionally a solution PDF) for a session.

    All requested worksheet types are combined into one PDF. When
    ``request.include_solutions`` is set, a second PDF with solutions shown
    is produced as well. Both documents are rendered before anything is
    written to disk, so a rendering failure does not leave a worksheet file
    without a registered worksheet record.

    Raises:
        HTTPException: 404 for an unknown session, 400 when the session has
            no vocabulary, 500 when rendering either PDF fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]

    # Render everything first; both renders share the same error mapping.
    pdf_bytes = await _render_pdf_or_500(
        _build_worksheet_html(vocabulary, request, title, show_solutions=False)
    )
    solution_bytes = None
    if request.include_solutions:
        solution_bytes = await _render_pdf_or_500(
            _build_worksheet_html(vocabulary, request, title, show_solutions=True)
        )

    # The directory is normally created with the session, but make sure it
    # still exists before writing.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)

    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    solution_path = None
    if solution_bytes is not None:
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Register the worksheet so the download endpoints can find its files.
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Send the generated worksheet PDF as a file attachment.

    Raises:
        HTTPException: 404 when the worksheet is unknown or its PDF file is
            no longer on disk.
    """
    record = _worksheets.get(worksheet_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    pdf_path = record["pdf_path"]
    if not os.path.exists(pdf_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    # The whole file is read into memory and served from an in-memory buffer.
    with open(pdf_path, 'rb') as pdf_file:
        payload = io.BytesIO(pdf_file.read())

    disposition = f"attachment; filename=worksheet_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Send the solution PDF of a worksheet as a file attachment.

    Raises:
        HTTPException: 404 when the worksheet is unknown, was generated
            without solutions, or its solution file is missing on disk.
    """
    record = _worksheets.get(worksheet_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    solution_path = record.get("solution_path")
    has_solution = bool(solution_path) and os.path.exists(solution_path)
    if not has_solution:
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    # The whole file is read into memory and served from an in-memory buffer.
    with open(solution_path, 'rb') as solution_file:
        payload = io.BytesIO(solution_file.read())

    disposition = f"attachment; filename=solution_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the source image stored for a session.

    The media type is derived from the stored file's extension; unknown
    extensions are served as ``application/octet-stream``.

    Raises:
        HTTPException: 404 when the session is unknown, has no image path,
            or the image file is missing on disk.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    image_path = record.get("image_path")
    if not (image_path and os.path.exists(image_path)):
        raise HTTPException(status_code=404, detail="Image not found")

    media_types = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
    }
    suffix = image_path.rsplit('.', 1)[-1].lower()
    media_type = media_types.get(suffix, 'application/octet-stream')

    # The whole file is read into memory and served from an in-memory buffer.
    with open(image_path, 'rb') as image_file:
        payload = io.BytesIO(image_file.read())

    return StreamingResponse(payload, media_type=media_type)
|
|
||||||
|
|
||||||
|
|
||||||
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Remove a session, its storage directory and every worksheet record that belongs to it.

    Raises:
        HTTPException: 404 when the session does not exist.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Files first: drop the whole per-session directory if it is present.
    storage_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(storage_dir):
        import shutil
        shutil.rmtree(storage_dir)

    del _sessions[session_id]

    # Collect the ids before deleting so the dict is not mutated while iterating.
    owned_ids = [
        worksheet_id
        for worksheet_id, worksheet in _worksheets.items()
        if worksheet["session_id"] == session_id
    ]
    for worksheet_id in owned_ids:
        del _worksheets[worksheet_id]

    return {"message": "Session deleted successfully", "session_id": session_id}
|
|
||||||
|
|
||||||
|
|
||||||
# --- Include sub-routers ---
# NOTE(review): these imports sit at the bottom of the module, presumably so
# the sub-modules can import names from this module without a cycle — confirm.
from vocab_worksheet_upload_api import upload_router
from vocab_worksheet_analysis_api import analysis_router

# Registration order is kept: upload routes first, then analysis routes.
for _sub_router in (upload_router, analysis_router):
    router.include_router(_sub_router)
|
|
||||||
|
|||||||
@@ -1,545 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to vocab/worksheet/compare_api.py
|
||||||
Vocabulary Worksheet Compare & Grid Analysis API.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Split from vocab_worksheet_analysis_api.py — contains the two largest
|
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.compare_api")
|
||||||
route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
|
|
||||||
"""
|
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, Query
|
|
||||||
from typing import Dict, Any
|
|
||||||
import base64
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
|
|
||||||
from vocab_worksheet_extraction import extract_vocabulary_from_image
|
|
||||||
|
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
|
||||||
VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
|
|
||||||
|
|
||||||
def _get_sessions():
    """Return the session store defined in ``vocab_worksheet_api``.

    The import happens at call time rather than at module import time.
    """
    from vocab_worksheet_api import _sessions as session_store

    return session_store
|
|
||||||
from vocab_worksheet_generation import convert_pdf_page_to_image
|
|
||||||
|
|
||||||
# Try to import Tesseract extractor
|
|
||||||
try:
|
|
||||||
from tesseract_vocab_extractor import (
|
|
||||||
run_tesseract_pipeline,
|
|
||||||
match_positions_to_vocab, TESSERACT_AVAILABLE,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
TESSERACT_AVAILABLE = False
|
|
||||||
|
|
||||||
# Try to import CV Pipeline
|
|
||||||
try:
|
|
||||||
from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
|
|
||||||
except ImportError:
|
|
||||||
CV_PIPELINE_AVAILABLE = False
|
|
||||||
|
|
||||||
# Try to import Grid Detection Service
|
|
||||||
try:
|
|
||||||
from services.grid_detection_service import GridDetectionService
|
|
||||||
GRID_SERVICE_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
GRID_SERVICE_AVAILABLE = False
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
compare_router = APIRouter()
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# OCR Compare & Grid Analysis Endpoints
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
def _failed_method_result(name, model, exc):
    """Build the result payload for an OCR method that raised ``exc``."""
    return {
        "name": name,
        "model": model,
        "duration_seconds": 0,
        "vocabulary_count": 0,
        "vocabulary": [],
        "confidence": 0,
        "success": False,
        "error": str(exc),
    }


def _to_compare_entries(raw_entries, example_key="example"):
    """Reduce raw vocabulary dicts to the english/german/example triple used for comparison."""
    return [
        {
            "english": raw.get("english", ""),
            "german": raw.get("german", ""),
            "example": raw.get(example_key, ""),
        }
        for raw in raw_entries
    ]


def _vocab_pair_set(vocab_list):
    """Return lower-cased, stripped (english, german) pairs for entries that have both sides."""
    return {
        (v["english"].lower().strip(), v["german"].lower().strip())
        for v in vocab_list
        if v["english"] and v["german"]
    }


@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(session_id: str, page_number: int):
    """
    Run the available OCR methods on one PDF page and compare their vocabulary.

    Steps:
    1. Render the requested page of the session's stored PDF to an image.
    2. Run Vision LLM extraction (always attempted).
    3. Run Tesseract and the CV pipeline when their modules were importable.
    4. Compare the normalised (english, german) pairs found by each method.

    Side effects: Tesseract output is cached on the session under
    ``tesseract_words``, ``tesseract_image_width``, ``tesseract_image_height``
    and ``tesseract_page_<n>``.

    A failure inside one method is logged and reported in that method's
    result entry; it does not abort the comparison.

    page_number is 0-indexed.

    Raises:
        HTTPException: 404 for an unknown session; 400 when the session has
            no PDF data or the page number is out of range.
    """
    import time
    import traceback

    logger.info(f"Compare OCR for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    methods_results = {}
    all_vocab_sets = {}

    # --- Method: Vision LLM ---
    try:
        # perf_counter is monotonic, so durations are unaffected by clock changes.
        start = time.perf_counter()
        vocab, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
        )
        duration = time.perf_counter() - start

        # Entries may be models exposing dict(), plain objects, or mappings.
        vocab_list = _to_compare_entries(
            (
                v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
                for v in vocab
            ),
            example_key="example_sentence",
        )

        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": round(duration, 1),
            "vocabulary_count": len(vocab_list),
            "vocabulary": vocab_list,
            "confidence": confidence,
            "success": len(vocab_list) > 0 and not error,
            "error": error if error else None,
        }
        all_vocab_sets["vision_llm"] = _vocab_pair_set(vocab_list)
    except Exception as e:
        logger.error(f"Vision LLM failed: {e}")
        methods_results["vision_llm"] = _failed_method_result("Vision LLM", VISION_MODEL, e)
        all_vocab_sets["vision_llm"] = set()

    # --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
    if TESSERACT_AVAILABLE:
        try:
            start = time.perf_counter()
            tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
            duration = time.perf_counter() - start

            tess_vocab = tess_result.get("vocabulary", [])
            tess_words = tess_result.get("words", [])

            # Store Tesseract words in session for later use (grid analysis, position matching)
            session["tesseract_words"] = tess_words
            session["tesseract_image_width"] = tess_result.get("image_width", 0)
            session["tesseract_image_height"] = tess_result.get("image_height", 0)
            session[f"tesseract_page_{page_number}"] = tess_result

            vocab_list_tess = _to_compare_entries(tess_vocab)

            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr (eng+deu)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_tess),
                "vocabulary": vocab_list_tess,
                "confidence": 0.7 if tess_vocab else 0,
                "success": len(vocab_list_tess) > 0,
                "error": tess_result.get("error"),
                "word_count": tess_result.get("word_count", 0),
                "columns_detected": len(tess_result.get("columns", [])),
            }
            all_vocab_sets["tesseract"] = _vocab_pair_set(vocab_list_tess)

            # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
            if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
                llm_vocab_with_bbox = match_positions_to_vocab(
                    tess_words,
                    methods_results["vision_llm"]["vocabulary"],
                    tess_result.get("image_width", 1),
                    tess_result.get("image_height", 1),
                )
                methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox

        except Exception as e:
            logger.error(f"Tesseract failed: {e}")
            logger.debug(traceback.format_exc())
            methods_results["tesseract"] = _failed_method_result("Tesseract OCR", "tesseract-ocr", e)
            all_vocab_sets["tesseract"] = set()

    # --- Method: CV Pipeline (Document Reconstruction) ---
    if CV_PIPELINE_AVAILABLE:
        try:
            start = time.perf_counter()
            cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
            duration = time.perf_counter() - start

            # A pipeline-reported error discards whatever vocabulary it returned.
            cv_vocab = cv_result.vocabulary if not cv_result.error else []
            vocab_list_cv = _to_compare_entries(cv_vocab)

            methods_results["cv_pipeline"] = {
                "name": "CV Pipeline (Document Reconstruction)",
                "model": "opencv + tesseract (multi-pass)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_cv),
                "vocabulary": vocab_list_cv,
                "confidence": 0.8 if cv_vocab else 0,
                "success": len(vocab_list_cv) > 0,
                "error": cv_result.error,
                "word_count": cv_result.word_count,
                "columns_detected": cv_result.columns_detected,
                "stages": cv_result.stages,
            }
            all_vocab_sets["cv_pipeline"] = _vocab_pair_set(vocab_list_cv)

        except Exception as e:
            logger.error(f"CV Pipeline failed: {e}")
            logger.debug(traceback.format_exc())
            methods_results["cv_pipeline"] = _failed_method_result(
                "CV Pipeline (Document Reconstruction)",
                "opencv + tesseract (multi-pass)",
                e,
            )
            all_vocab_sets["cv_pipeline"] = set()

    # --- Build comparison ---
    all_unique = set().union(*all_vocab_sets.values())

    found_by_all = []
    found_by_some = []
    for english, german in sorted(all_unique):
        found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
        entry = {"english": english, "german": german, "methods": found_in}
        if len(found_in) == len(all_vocab_sets):
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)

    # Share of unique pairs that every executed method agreed on.
    agreement_rate = len(found_by_all) / len(all_unique) if all_unique else 0

    # Best method = the one that produced the most distinct pairs.
    best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"

    return {
        "session_id": session_id,
        "page_number": page_number,
        "methods": methods_results,
        "comparison": {
            "found_by_all_methods": found_by_all,
            "found_by_some_methods": found_by_some,
            "total_unique_vocabulary": len(all_unique),
            "agreement_rate": agreement_rate,
        },
        "recommendation": {
            "best_method": best_method,
            "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
        },
    }
|
|
||||||
|
|
||||||
|
|
||||||
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
|
|
||||||
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
|
|
||||||
"""
|
|
||||||
Analyze the grid/table structure of a vocabulary page.
|
|
||||||
|
|
||||||
Hybrid approach:
|
|
||||||
1. If Tesseract bounding boxes are available (from compare-ocr), use them for
|
|
||||||
real spatial positions via GridDetectionService.
|
|
||||||
2. Otherwise fall back to Vision LLM for grid structure detection.
|
|
||||||
|
|
||||||
page_number is 0-indexed.
|
|
||||||
Returns GridData structure expected by the frontend GridOverlay component.
|
|
||||||
"""
|
|
||||||
import httpx
|
|
||||||
import time
|
|
||||||
|
|
||||||
logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
|
|
||||||
|
|
||||||
if session_id not in _get_sessions():
|
|
||||||
raise HTTPException(status_code=404, detail="Session not found")
|
|
||||||
|
|
||||||
session = _get_sessions()[session_id]
|
|
||||||
pdf_data = session.get("pdf_data")
|
|
||||||
|
|
||||||
if not pdf_data:
|
|
||||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
|
||||||
|
|
||||||
page_count = session.get("pdf_page_count", 1)
|
|
||||||
if page_number < 0 or page_number >= page_count:
|
|
||||||
raise HTTPException(status_code=400, detail=f"Invalid page number.")
|
|
||||||
|
|
||||||
# Convert page to image
|
|
||||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
|
||||||
|
|
||||||
# --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
|
|
||||||
tess_page_data = session.get(f"tesseract_page_{page_number}")
|
|
||||||
|
|
||||||
if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
|
|
||||||
try:
|
|
||||||
# Run Tesseract if not already cached
|
|
||||||
if not tess_page_data:
|
|
||||||
logger.info("Running Tesseract for grid analysis (not cached)")
|
|
||||||
from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
|
|
||||||
tess_page_data = await _run_tess(image_data, lang="eng+deu")
|
|
||||||
session[f"tesseract_page_{page_number}"] = tess_page_data
|
|
||||||
session["tesseract_words"] = tess_page_data.get("words", [])
|
|
||||||
session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
|
|
||||||
session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
|
|
||||||
|
|
||||||
tess_words = tess_page_data.get("words", [])
|
|
||||||
img_w = tess_page_data.get("image_width", 0)
|
|
||||||
img_h = tess_page_data.get("image_height", 0)
|
|
||||||
|
|
||||||
if tess_words and img_w > 0 and img_h > 0:
|
|
||||||
service = GridDetectionService()
|
|
||||||
regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
|
|
||||||
|
|
||||||
if regions:
|
|
||||||
grid_result = service.detect_grid(regions)
|
|
||||||
grid_dict = grid_result.to_dict()
|
|
||||||
|
|
||||||
# Merge LLM text if available (better quality than Tesseract text)
|
|
||||||
# The LLM vocab was stored during compare-ocr
|
|
||||||
grid_dict["source"] = "tesseract+grid_service"
|
|
||||||
grid_dict["word_count"] = len(tess_words)
|
|
||||||
|
|
||||||
logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
|
|
||||||
f"{grid_result.stats.get('recognized', 0)} recognized")
|
|
||||||
|
|
||||||
return {"success": True, "grid": grid_dict}
|
|
||||||
|
|
||||||
logger.info("Tesseract data insufficient, falling back to LLM")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
|
|
||||||
import traceback
|
|
||||||
logger.debug(traceback.format_exc())
|
|
||||||
|
|
||||||
# --- Strategy 2: Fall back to Vision LLM ---
|
|
||||||
image_base64 = base64.b64encode(image_data).decode("utf-8")
|
|
||||||
|
|
||||||
grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
|
|
||||||
|
|
||||||
Your task: Identify the TABLE STRUCTURE and extract each cell's content.
|
|
||||||
|
|
||||||
Return a JSON object with this EXACT structure:
|
|
||||||
{
|
|
||||||
"rows": <number of rows>,
|
|
||||||
"columns": <number of columns>,
|
|
||||||
"column_types": ["english", "german", "example"],
|
|
||||||
"entries": [
|
|
||||||
{
|
|
||||||
"row": 0,
|
|
||||||
"col": 0,
|
|
||||||
"text": "the word or phrase in this cell",
|
|
||||||
"column_type": "english",
|
|
||||||
"confidence": 0.95
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
Rules:
|
|
||||||
- row and col are 0-indexed
|
|
||||||
- column_type is one of: "english", "german", "example", "unknown"
|
|
||||||
- Detect whether each column contains English words, German translations, or example sentences
|
|
||||||
- Include ALL non-empty cells
|
|
||||||
- confidence is 0.0-1.0 based on how clear the text is
|
|
||||||
- If a cell is empty, don't include it
|
|
||||||
- Return ONLY the JSON, no other text"""
|
|
||||||
|
|
||||||
try:
|
|
||||||
import asyncio
|
|
||||||
|
|
||||||
raw_text = ""
|
|
||||||
max_retries = 3
|
|
||||||
for attempt in range(max_retries):
|
|
||||||
async with httpx.AsyncClient(timeout=300.0) as client:
|
|
||||||
response = await client.post(
|
|
||||||
f"{OLLAMA_URL}/api/chat",
|
|
||||||
json={
|
|
||||||
"model": VISION_MODEL,
|
|
||||||
"messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
|
|
||||||
"stream": False,
|
|
||||||
"options": {"temperature": 0.1, "num_predict": 8192},
|
|
||||||
},
|
|
||||||
timeout=300.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
if response.status_code == 500 and attempt < max_retries - 1:
|
|
||||||
wait_time = 10 * (attempt + 1)
|
|
||||||
logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
|
|
||||||
await asyncio.sleep(wait_time)
|
|
||||||
continue
|
|
||||||
elif response.status_code != 200:
|
|
||||||
error_detail = response.text[:200] if response.text else "Unknown error"
|
|
||||||
return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}
|
|
||||||
|
|
||||||
raw_text = response.json().get("message", {}).get("content", "")
|
|
||||||
break
|
|
||||||
|
|
||||||
# Parse JSON from response
|
|
||||||
import re
|
|
||||||
json_match = re.search(r'\{[\s\S]*\}', raw_text)
|
|
||||||
if not json_match:
|
|
||||||
return {"success": False, "error": "Could not parse grid structure from LLM response"}
|
|
||||||
|
|
||||||
grid_raw = json.loads(json_match.group())
|
|
||||||
|
|
||||||
num_rows = grid_raw.get("rows", 0)
|
|
||||||
num_cols = grid_raw.get("columns", 0)
|
|
||||||
column_types = grid_raw.get("column_types", [])
|
|
||||||
entries = grid_raw.get("entries", [])
|
|
||||||
|
|
||||||
if num_rows == 0 or num_cols == 0:
|
|
||||||
return {"success": False, "error": "No grid structure detected"}
|
|
||||||
|
|
||||||
# Ensure column_types has the right length
|
|
||||||
while len(column_types) < num_cols:
|
|
||||||
column_types.append("unknown")
|
|
||||||
|
|
||||||
# Build cell grid with percentage-based coordinates
|
|
||||||
row_height = 100.0 / num_rows
|
|
||||||
col_width = 100.0 / num_cols
|
|
||||||
|
|
||||||
# Track which cells have content
|
|
||||||
cell_map = {}
|
|
||||||
for entry in entries:
|
|
||||||
r = entry.get("row", 0)
|
|
||||||
c = entry.get("col", 0)
|
|
||||||
cell_map[(r, c)] = entry
|
|
||||||
|
|
||||||
cells = []
|
|
||||||
recognized_count = 0
|
|
||||||
empty_count = 0
|
|
||||||
problematic_count = 0
|
|
||||||
|
|
||||||
for r in range(num_rows):
|
|
||||||
row_cells = []
|
|
||||||
for c in range(num_cols):
|
|
||||||
x = c * col_width
|
|
||||||
y = r * row_height
|
|
||||||
|
|
||||||
if (r, c) in cell_map:
|
|
||||||
entry = cell_map[(r, c)]
|
|
||||||
text = entry.get("text", "").strip()
|
|
||||||
conf = entry.get("confidence", 0.8)
|
|
||||||
col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")
|
|
||||||
|
|
||||||
if text:
|
|
||||||
status = "recognized" if conf >= 0.5 else "problematic"
|
|
||||||
if status == "recognized":
|
|
||||||
recognized_count += 1
|
|
||||||
else:
|
|
||||||
problematic_count += 1
|
|
||||||
else:
|
|
||||||
status = "empty"
|
|
||||||
empty_count += 1
|
|
||||||
else:
|
|
||||||
text = ""
|
|
||||||
conf = 0.0
|
|
||||||
col_type = column_types[c] if c < len(column_types) else "unknown"
|
|
||||||
status = "empty"
|
|
||||||
empty_count += 1
|
|
||||||
|
|
||||||
row_cells.append({
|
|
||||||
"row": r,
|
|
||||||
"col": c,
|
|
||||||
"x": round(x, 2),
|
|
||||||
"y": round(y, 2),
|
|
||||||
"width": round(col_width, 2),
|
|
||||||
"height": round(row_height, 2),
|
|
||||||
"text": text,
|
|
||||||
"confidence": conf,
|
|
||||||
"status": status,
|
|
||||||
"column_type": col_type,
|
|
||||||
})
|
|
||||||
cells.append(row_cells)
|
|
||||||
|
|
||||||
total = num_rows * num_cols
|
|
||||||
coverage = (recognized_count + problematic_count) / max(total, 1)
|
|
||||||
|
|
||||||
# Column and row boundaries as percentages
|
|
||||||
col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
|
|
||||||
row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]
|
|
||||||
|
|
||||||
grid_data = {
|
|
||||||
"rows": num_rows,
|
|
||||||
"columns": num_cols,
|
|
||||||
"cells": cells,
|
|
||||||
"column_types": column_types,
|
|
||||||
"column_boundaries": col_boundaries,
|
|
||||||
"row_boundaries": row_boundaries,
|
|
||||||
"deskew_angle": 0.0,
|
|
||||||
"source": "vision_llm",
|
|
||||||
"stats": {
|
|
||||||
"recognized": recognized_count,
|
|
||||||
"problematic": problematic_count,
|
|
||||||
"empty": empty_count,
|
|
||||||
"manual": 0,
|
|
||||||
"total": total,
|
|
||||||
"coverage": round(coverage, 3),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
return {"success": True, "grid": grid_data}
|
|
||||||
|
|
||||||
except httpx.TimeoutException:
|
|
||||||
logger.error("Grid analysis timed out")
|
|
||||||
return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Grid analysis failed: {e}")
|
|
||||||
import traceback
|
|
||||||
logger.debug(traceback.format_exc())
|
|
||||||
return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}
|
|
||||||
|
|||||||
@@ -1,325 +1,4 @@
|
|||||||
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
|
# Backward-compat shim -- module moved to vocab/worksheet/extraction.py
|
||||||
|
import importlib as _importlib
|
||||||
Contains:
|
import sys as _sys
|
||||||
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
|
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.extraction")
|
||||||
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
|
|
||||||
- _get_demo_vocabulary(): Demo data for testing
|
|
||||||
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
|
|
||||||
"""
|
|
||||||
|
|
||||||
import base64
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import uuid
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
from vocab_worksheet_models import VocabularyEntry
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Ollama Configuration
|
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
|
||||||
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Vision LLM Vocabulary Extraction
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
|
|
||||||
|
|
||||||
AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
|
|
||||||
|
|
||||||
{
|
|
||||||
"vocabulary": [
|
|
||||||
{
|
|
||||||
"english": "to improve",
|
|
||||||
"german": "verbessern",
|
|
||||||
"example": "I want to improve my English."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
REGELN:
|
|
||||||
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
|
|
||||||
2. Behalte die exakte Schreibweise bei
|
|
||||||
3. Bei fehlenden Beispielsaetzen: "example": null
|
|
||||||
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
|
|
||||||
5. Gib NUR valides JSON zurueck, keine Erklaerungen
|
|
||||||
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
|
|
||||||
|
|
||||||
Beispiel-Output:
|
|
||||||
{
|
|
||||||
"vocabulary": [
|
|
||||||
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
|
|
||||||
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
|
|
||||||
]
|
|
||||||
}"""
|
|
||||||
|
|
||||||
|
|
||||||
async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """Extract vocabulary entries from one page image.

    The hybrid extractor (``hybrid_vocab_extractor``) is only attempted when
    ``use_hybrid`` is true; it is off by default. Whenever the hybrid path is
    skipped, reports an error, yields nothing, or raises, the function falls
    back to the Vision LLM served by Ollama at ``OLLAMA_URL``.

    Args:
        image_data: Raw image bytes of the page.
        filename: Original filename, used for log messages only.
        page_number: 0-indexed page number; reported 1-indexed in entries
            and error messages.
        use_hybrid: Try the hybrid OCR+LLM extractor before the Vision LLM.

    Returns:
        ``(entries, confidence, error_message)``. ``error_message`` is the
        empty string on success; on failure the list is empty and the
        confidence is ``0.0``.
    """
    import traceback

    human_page = page_number + 1

    # ----------------------------------------------------------------------
    # Optional first attempt: hybrid OCR + LLM extractor
    # ----------------------------------------------------------------------
    if use_hybrid:
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid

            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")

            raw_entries, hybrid_confidence, hybrid_error = await extract_vocabulary_hybrid(
                image_data, page_number
            )

            if hybrid_error:
                # Not fatal: continue with the Vision LLM below.
                logger.warning(f"Hybrid extraction had issues: {hybrid_error}")
            elif raw_entries:
                converted = []
                for item in raw_entries:
                    # Entries missing either language are dropped.
                    if not (item.get("english") and item.get("german")):
                        continue
                    converted.append(
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=item.get("english", ""),
                            german=item.get("german", ""),
                            example_sentence=item.get("example"),
                            source_page=human_page,
                        )
                    )
                logger.info(f"Hybrid extraction: {len(converted)} entries from {filename}")
                return converted, hybrid_confidence, ""

        except ImportError as exc:
            logger.warning(f"Hybrid extractor not available: {exc}. Falling back to Vision LLM.")
        except Exception as exc:
            logger.warning(f"Hybrid extraction failed: {exc}. Falling back to Vision LLM.")
            logger.debug(traceback.format_exc())

    # ----------------------------------------------------------------------
    # Vision LLM via Ollama (model taken from VISION_MODEL)
    # ----------------------------------------------------------------------
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # Cheap availability probe before sending the large image payload.
        async with httpx.AsyncClient(timeout=10.0) as probe:
            try:
                health = await probe.get(f"{OLLAMA_URL}/api/tags")
            except Exception as exc:
                logger.error(f"Ollama health check failed: {exc}")
                return [], 0.0, f"Seite {human_page}: Verbindung zu Ollama fehlgeschlagen"
            if health.status_code != 200:
                logger.error(f"Ollama not available at {OLLAMA_URL}")
                return [], 0.0, f"Seite {human_page}: Ollama nicht verfuegbar"

        encoded_image = base64.b64encode(image_data).decode("utf-8")

        user_message = {
            "role": "user",
            "content": VOCAB_EXTRACTION_PROMPT,
            "images": [encoded_image],
        }
        request_body = {
            "model": VISION_MODEL,
            "messages": [user_message],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            },
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # The per-request timeout (5 minutes per page) is what applies to
        # this call; the client-level value is only a default.
        async with httpx.AsyncClient(timeout=600.0) as llm_client:
            reply = await llm_client.post(
                f"{OLLAMA_URL}/api/chat",
                json=request_body,
                timeout=300.0,
            )
            reply.raise_for_status()

        llm_text = reply.json().get("message", {}).get("content", "")
        logger.info(f"Ollama response received: {len(llm_text)} chars")

        entries = parse_vocabulary_json(llm_text)
        for item in entries:
            item.source_page = human_page

        # Coarse heuristic: any result counts as a confident extraction.
        estimated_confidence = 0.85 if entries else 0.1

        logger.info(f"Vision LLM extracted {len(entries)} vocabulary entries from {filename}")
        return entries, estimated_confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {human_page}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as exc:
        logger.error(f"Vocabulary extraction failed for {filename}: {exc}")
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {human_page}: Fehler - {str(exc)[:50]}"
|
|
||||||
|
|
||||||
|
|
||||||
def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Build a fixed list of sample entries for testing without a Vision LLM.

    Each call creates fresh ``VocabularyEntry`` objects with new random ids.
    """
    # (english, german, example sentence)
    samples = (
        ("to achieve", "erreichen, erzielen", "She achieved her goals."),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
        ("improve", "verbessern", "I want to improve my English."),
        ("improvement", "Verbesserung", "There has been a lot of improvement."),
        ("success", "Erfolg", "The project was a success."),
        ("successful", "erfolgreich", "She is a successful businesswoman."),
        ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
        ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
    )

    demo = []
    for english, german, example in samples:
        demo.append(
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=example,
            )
        )
    return demo
|
|
||||||
|
|
||||||
|
|
||||||
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary entries out of an LLM response.

    The response may wrap the JSON object in extra prose, so the span from
    the first ``{`` to the last ``}`` is extracted and parsed with
    progressively more lenient strategies:

    1. strict ``json.loads``;
    2. control characters removed, raw newlines/tabs inside strings allowed;
    3. additionally, trailing commas removed and bare keys quoted;
    4. regex extraction of ``{"english": ..., "german": ...}`` fragments.

    Entries without both an English and a German string, and entries whose
    text is implausibly long, are skipped individually.

    Args:
        text: Raw text returned by the LLM.

    Returns:
        The parsed entries; an empty list when nothing usable was found or
        an unexpected error occurred (the error is logged).
    """

    def strip_control_chars(s: str) -> str:
        """Drop control characters other than tab, newline and carriage return."""
        return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)

    def try_parse_json(json_str: str):
        """Return the decoded JSON value, or None if every strategy fails."""
        # Strategy 1: Direct parse
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: tolerate raw control characters inside string values.
        # strict=False accepts unescaped newlines/tabs within strings while
        # leaving the structural whitespace between tokens untouched.
        cleaned = strip_control_chars(json_str)
        try:
            return json.loads(cleaned, strict=False)
        except json.JSONDecodeError:
            pass

        # Strategy 3: Try to fix common issues
        try:
            # Remove trailing commas before } or ]
            fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)
            # Fix unquoted keys
            fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
            return json.loads(fixed, strict=False)
        except json.JSONDecodeError:
            pass

        return None

    def as_text(value) -> str:
        """Return a stripped string; null or non-string values become ''."""
        return value.strip() if isinstance(value, str) else ""

    try:
        # Find JSON in response (may have extra text)
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        data = try_parse_json(text[start:end])

        if data is None:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = []
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'

            for english, german, example in re.findall(pattern, text, re.IGNORECASE | re.DOTALL):
                english = english.strip()
                german = german.strip()
                if english and german:
                    vocabulary.append(
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=english,
                            german=german,
                            example_sentence=example.strip() or None,
                        )
                    )

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        raw_entries = data.get("vocabulary") if isinstance(data, dict) else None
        if not isinstance(raw_entries, list):
            return []

        vocabulary = []
        for entry in raw_entries:
            # One malformed entry must not discard the rest of the page.
            if not isinstance(entry, dict):
                continue

            english = as_text(entry.get("english"))
            german = as_text(entry.get("german"))

            # Skip entries that look like hallucinations (very long text)
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            vocabulary.append(
                VocabularyEntry(
                    id=str(uuid.uuid4()),
                    english=english,
                    german=german,
                    example_sentence=entry.get("example"),
                    word_type=entry.get("word_type"),
                )
            )

        return vocabulary

    except Exception as e:
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
|
|
||||||
|
|||||||
@@ -1,260 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to vocab/worksheet/generation.py
|
||||||
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
|
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.generation")
|
||||||
|
|
||||||
Functions:
|
|
||||||
- generate_worksheet_html(): Build HTML for various worksheet types
|
|
||||||
- generate_worksheet_pdf(): Convert HTML to PDF via WeasyPrint
|
|
||||||
- get_pdf_page_count(): Count pages in a PDF (PyMuPDF)
|
|
||||||
- convert_pdf_page_to_image(): Render single PDF page to PNG
|
|
||||||
- convert_pdf_to_images(): Render multiple PDF pages to PNG
|
|
||||||
"""
|
|
||||||
|
|
||||||
import io
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from fastapi import HTTPException
|
|
||||||
|
|
||||||
from vocab_worksheet_models import VocabularyEntry, WorksheetType
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Optional dependency: WeasyPrint
|
|
||||||
try:
|
|
||||||
from weasyprint import HTML as _WeasyHTML
|
|
||||||
WEASYPRINT_AVAILABLE = True
|
|
||||||
except (ImportError, OSError):
|
|
||||||
WEASYPRINT_AVAILABLE = False
|
|
||||||
logger.warning("WeasyPrint not available")
|
|
||||||
|
|
||||||
# Optional dependency: PyMuPDF
|
|
||||||
try:
|
|
||||||
import fitz # PyMuPDF
|
|
||||||
FITZ_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
FITZ_AVAILABLE = False
|
|
||||||
logger.warning("PyMuPDF (fitz) not available")
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Worksheet HTML Generation
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Generate the HTML document for one worksheet.

    All text taken from the vocabulary and the title is HTML-escaped before
    being placed in the document, since it originates from OCR/LLM output or
    user edits.

    Args:
        vocabulary: Entries to render.
        worksheet_type: Which exercise to render. ``EN_TO_DE``, ``DE_TO_EN``,
            ``COPY_PRACTICE`` and ``GAP_FILL`` are handled here; any other
            value produces only the page header.
        title: Heading shown at the top of the page.
        show_solutions: Render answers instead of blanks.
        repetitions: For copy practice with solutions, how often the word is
            repeated in the answer cell.
        line_height: ``"normal"``, ``"large"`` or ``"extra-large"``; unknown
            values fall back to the normal height.

    Returns:
        A complete HTML document as a string.
    """
    # Local imports: the parameter-free name ``html`` is reserved for the
    # stdlib module here, so the output is accumulated in ``parts`` instead.
    import re
    from html import escape

    def esc(value) -> str:
        """HTML-escape *value* after converting it to text."""
        return escape(str(value))

    gap_markup = '<span class="gap"></span>'

    def make_gap_sentence(sentence: str, english: str) -> str:
        """Blank out the first word of *english* that occurs in *sentence*.

        The match is a case-insensitive substring match done in one pass on
        the raw sentence, so the inserted markup itself is never rewritten.
        """
        lowered = sentence.lower()
        for word in english.split():
            if word.lower() in lowered:
                pieces = re.split(re.escape(word), sentence, flags=re.IGNORECASE)
                return gap_markup.join(esc(piece) for piece in pieces)
        return esc(sentence)

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    parts = [f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
@page {{ size: A4; margin: 2cm; }}
body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
h1 {{ font-size: 24px; margin-bottom: 10px; }}
.meta {{ color: #666; margin-bottom: 20px; }}
.name-line {{ margin-bottom: 30px; }}
.vocab-table {{ width: 100%; border-collapse: collapse; }}
.vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
.vocab-word {{ width: 40%; font-weight: 500; }}
.vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
.vocab-answer {{ width: 60%; color: #2563eb; }}
.gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
.hint {{ color: #666; font-style: italic; font-size: 12px; }}
.section {{ margin-top: 30px; }}
.section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
</style>
</head>
<body>
<h1>{esc(title)}</h1>
<div class="name-line">Name: _________________________ Datum: _____________</div>
"""]

    if worksheet_type in (WorksheetType.EN_TO_DE, WorksheetType.DE_TO_EN):
        # Both translation directions share one layout; only the heading and
        # which side is the prompt differ.
        en_to_de = worksheet_type == WorksheetType.EN_TO_DE
        heading = 'Uebersetze ins Deutsche:' if en_to_de else 'Uebersetze ins Englische:'
        parts.append(f'<div class="section"><div class="section-title">{heading}</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            if en_to_de:
                prompt, answer = entry.english, entry.german
            else:
                prompt, answer = entry.german, entry.english
            if show_solutions:
                parts.append(f'<tr><td class="vocab-word">{esc(prompt)}</td><td class="vocab-answer">{esc(answer)}</td></tr>')
            else:
                parts.append(f'<tr><td class="vocab-word">{esc(prompt)}</td><td class="vocab-blank"></td></tr>')
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        parts.append('<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            word = esc(entry.english)
            parts.append(f'<tr><td class="vocab-word">{word}</td>')
            parts.append('<td class="vocab-blank">')
            if show_solutions:
                parts.append(f' {word} ' * repetitions)
            parts.append('</td></tr>')
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.GAP_FILL:
        # Only entries that come with an example sentence can be used.
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            parts.append('<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>')
            for i, entry in enumerate(entries_with_examples, 1):
                gap_sentence = make_gap_sentence(entry.example_sentence, entry.english)
                parts.append(f'<p>{i}. {gap_sentence}</p>')
                if show_solutions:
                    parts.append(f'<p class="hint">Loesung: {esc(entry.english)}</p>')
                else:
                    parts.append(f'<p class="hint">({esc(entry.german)})</p>')
            parts.append('</div>')

    parts.append('</body></html>')
    return ''.join(parts)
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Worksheet PDF Generation
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
async def generate_worksheet_pdf(html: str) -> bytes:
    """Render *html* to PDF bytes with WeasyPrint.

    If WeasyPrint cannot be imported, the HTML itself is returned as UTF-8
    bytes instead. ``OSError`` is treated like ``ImportError`` at import
    time, matching the module-level availability check (WeasyPrint raises
    it when its native libraries are missing).

    Args:
        html: Complete HTML document to render.

    Returns:
        PDF bytes, or the UTF-8 encoded HTML when WeasyPrint is unavailable.

    Raises:
        Exception: Any error raised while rendering is logged and re-raised.
    """
    try:
        from weasyprint import HTML
    except (ImportError, OSError):
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')

    try:
        return HTML(string=html).write_pdf()
    except Exception as e:
        logger.error(f"PDF generation failed: {e}")
        raise
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# PDF Utilities (PyMuPDF)
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return how many pages the PDF in *pdf_data* has.

    Any failure (PyMuPDF missing, unreadable data, ...) is logged and
    reported as ``0`` rather than raised.
    """
    try:
        import fitz

        document = fitz.open(stream=pdf_data, filetype="pdf")
        page_total = document.page_count
        document.close()
    except Exception as exc:
        logger.error(f"Failed to get PDF page count: {exc}")
        return 0
    return page_total
|
|
||||||
|
|
||||||
|
|
||||||
async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, return a smaller thumbnail image

    Returns:
        The rendered page as PNG bytes.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 if the PDF is
            empty, the page does not exist, or rendering fails.
    """
    try:
        import fitz  # PyMuPDF

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")

            if page_number >= pdf_document.page_count:
                raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")

            page = pdf_document[page_number]

            # Render page to image
            # For thumbnails: lower resolution, for OCR: higher resolution
            zoom = 0.5 if thumbnail else 2.0
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)

            png_data = pix.tobytes("png")
        finally:
            # Release the document on the error paths as well.
            pdf_document.close()

        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
|
|
||||||
|
|
||||||
|
|
||||||
async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert all pages.
            Page numbers at or beyond the page count are skipped silently.

    Returns:
        One PNG byte string per converted page, in the order requested.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 if the PDF is
            empty or rendering fails.
    """
    try:
        import fitz

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")

            # If no pages specified, convert all
            if pages is None:
                pages = list(range(pdf_document.page_count))

            images = []
            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)

            for page_num in pages:
                if page_num < pdf_document.page_count:
                    page = pdf_document[page_num]
                    pix = page.get_pixmap(matrix=mat)
                    images.append(pix.tobytes("png"))
        finally:
            # Release the document on the error paths as well.
            pdf_document.close()

        logger.info(f"Converted {len(images)} PDF pages to images")
        return images

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
|
|
||||||
|
|||||||
@@ -1,86 +1,4 @@
|
|||||||
"""Pydantic models and enums for the Vocab Worksheet API."""
|
# Backward-compat shim -- module moved to vocab/worksheet/models.py
|
||||||
|
import importlib as _importlib
|
||||||
from datetime import datetime
|
import sys as _sys
|
||||||
from enum import Enum
|
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.models")
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Enums
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
# String-valued enum of the worksheet kinds that can be requested.
# Members subclass str, so they compare equal to their plain string values.
class WorksheetType(str, Enum):
    EN_TO_DE = "en_to_de"  # English -> German translation
    DE_TO_EN = "de_to_en"  # German -> English translation
    COPY_PRACTICE = "copy"  # Write word multiple times
    GAP_FILL = "gap_fill"  # Fill in the blanks
    COMBINED = "combined"  # All types combined
|
|
||||||
|
|
||||||
|
|
||||||
# String-valued enum of the states a vocab session can be in.
class SessionStatus(str, Enum):
    PENDING = "pending"  # Session created, no upload yet
    PROCESSING = "processing"  # OCR in progress
    EXTRACTED = "extracted"  # Vocabulary extracted, ready to edit
    COMPLETED = "completed"  # Worksheet generated
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Pydantic Models
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
# One vocabulary item: an English/German pair plus optional metadata.
# Only id, english and german are required; everything else defaults to None.
class VocabularyEntry(BaseModel):
    id: str
    english: str
    german: str
    example_sentence: Optional[str] = None
    example_sentence_gap: Optional[str] = None  # With ___ for gap-fill
    word_type: Optional[str] = None  # noun, verb, adjective, etc.
    source_page: Optional[int] = None  # Page number where entry was found (1-indexed)
|
|
||||||
|
|
||||||
|
|
||||||
# Request body for creating a session; only the name is mandatory.
class SessionCreate(BaseModel):
    name: str
    description: Optional[str] = None
    source_language: str = "en"  # Source language (default English)
    target_language: str = "de"  # Target language (default German)
|
|
||||||
|
|
||||||
|
|
||||||
# Response model describing a session.
# NOTE(review): status is typed as a plain str here; presumably it carries a
# SessionStatus value -- confirm against the code that builds this response.
class SessionResponse(BaseModel):
    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    status: str
    vocabulary_count: int
    image_path: Optional[str]
    created_at: datetime
|
|
||||||
|
|
||||||
|
|
||||||
class VocabularyResponse(BaseModel):
    """Response payload carrying the vocabulary of one session."""

    session_id: str
    vocabulary: List[VocabularyEntry]
    # May be None; no default is declared for this field.
    extraction_confidence: Optional[float]
|
|
||||||
|
|
||||||
|
|
||||||
class VocabularyUpdate(BaseModel):
    """Request payload carrying a list of vocabulary entries for an update."""

    vocabulary: List[VocabularyEntry]
|
|
||||||
|
|
||||||
|
|
||||||
class WorksheetGenerateRequest(BaseModel):
    """Request payload describing which worksheets to generate and how."""

    worksheet_types: List[WorksheetType]
    title: Optional[str] = None
    include_solutions: bool = True
    # For copy practice
    repetitions: int = 3
    # normal, large, extra-large
    line_height: str = "normal"
|
|
||||||
|
|
||||||
|
|
||||||
class WorksheetResponse(BaseModel):
    """Response payload describing a generated worksheet."""

    id: str
    session_id: str
    worksheet_types: List[str]

    pdf_path: str
    solution_path: Optional[str]
    generated_at: datetime
|
|
||||||
|
|||||||
@@ -1,481 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to vocab/worksheet/ocr.py
|
||||||
Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Extracted from vocab_worksheet_api.py to keep file sizes manageable.
|
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.ocr")
|
||||||
|
|
||||||
Pipeline steps:
|
|
||||||
orientation → deskew → dewarp → crop → scan-quality → enhance →
|
|
||||||
dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
|
|
||||||
vocab extraction → row merging
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import uuid
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Optional heavy dependencies (not available in every environment)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
try:
|
|
||||||
import cv2
|
|
||||||
import numpy as np
|
|
||||||
except ImportError:
|
|
||||||
cv2 = None # type: ignore[assignment]
|
|
||||||
np = None # type: ignore[assignment]
|
|
||||||
logger.warning("cv2 / numpy not available — OCR pipeline disabled")
|
|
||||||
|
|
||||||
try:
|
|
||||||
from PIL import Image
|
|
||||||
except ImportError:
|
|
||||||
Image = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
try:
|
|
||||||
import pytesseract
|
|
||||||
except ImportError:
|
|
||||||
pytesseract = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
# CV pipeline helpers
|
|
||||||
try:
|
|
||||||
from cv_vocab_pipeline import (
|
|
||||||
deskew_two_pass,
|
|
||||||
dewarp_image,
|
|
||||||
detect_and_fix_orientation,
|
|
||||||
_cells_to_vocab_entries,
|
|
||||||
_fix_phonetic_brackets,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
deskew_two_pass = None # type: ignore[assignment]
|
|
||||||
dewarp_image = None # type: ignore[assignment]
|
|
||||||
detect_and_fix_orientation = None # type: ignore[assignment]
|
|
||||||
_cells_to_vocab_entries = None # type: ignore[assignment]
|
|
||||||
_fix_phonetic_brackets = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
try:
|
|
||||||
from cv_cell_grid import (
|
|
||||||
_merge_wrapped_rows,
|
|
||||||
_merge_phonetic_continuation_rows,
|
|
||||||
_merge_continuation_rows,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
_merge_wrapped_rows = None # type: ignore[assignment]
|
|
||||||
_merge_phonetic_continuation_rows = None # type: ignore[assignment]
|
|
||||||
_merge_continuation_rows = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
try:
|
|
||||||
from cv_ocr_engines import ocr_region_rapid
|
|
||||||
except ImportError:
|
|
||||||
ocr_region_rapid = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
try:
|
|
||||||
from cv_vocab_types import PageRegion
|
|
||||||
except ImportError:
|
|
||||||
PageRegion = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
try:
|
|
||||||
from ocr_pipeline_ocr_merge import (
|
|
||||||
_split_paddle_multi_words,
|
|
||||||
_merge_paddle_tesseract,
|
|
||||||
_deduplicate_words,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
_split_paddle_multi_words = None # type: ignore[assignment]
|
|
||||||
_merge_paddle_tesseract = None # type: ignore[assignment]
|
|
||||||
_deduplicate_words = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
try:
|
|
||||||
from cv_words_first import build_grid_from_words
|
|
||||||
except ImportError:
|
|
||||||
build_grid_from_words = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
try:
|
|
||||||
from ocr_pipeline_session_store import (
|
|
||||||
create_session_db as create_pipeline_session_db,
|
|
||||||
update_session_db as update_pipeline_session_db,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
create_pipeline_session_db = None # type: ignore[assignment]
|
|
||||||
update_pipeline_session_db = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Main pipeline function
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _parse_tesseract_conf(raw) -> int:
    """Convert a Tesseract confidence value to an int, or -1 if unparseable.

    Accepts ints, floats and numeric strings (including float strings such
    as ``"96.5"``); fractional values are truncated.
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


async def _run_ocr_pipeline_for_page(
    img_bgr: "np.ndarray",
    page_number: int,
    vocab_session_id: str,
    *,
    ipa_mode: str = "none",
    syllable_mode: str = "none",
    enable_enhance: bool = True,
    max_columns: Optional[int] = 3,
    override_min_conf: Optional[int] = None,
) -> tuple:
    """Run the full Kombi OCR pipeline on a single page and return vocab entries.

    Uses the same pipeline as the admin OCR Kombi pipeline:
    orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
    (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)

    Args:
        img_bgr: BGR numpy array.
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID for logging.
        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        enable_enhance: If True, run image enhancement when the scan-quality
            report flags the page as degraded.
        max_columns: Forwarded to ``build_grid_from_words`` as ``max_columns``.
        override_min_conf: If truthy, used as the minimum Tesseract word
            confidence; otherwise the scan-quality recommendation is used
            (or 40 when no report is available).

    Returns:
        A 3-tuple ``(entries, rotation, scan_quality_report)`` where
        ``entries`` is a list of dicts (id, english, german,
        example_sentence, source_page), ``rotation`` is the orientation
        correction applied (0, 90, 180, 270) and ``scan_quality_report`` is
        the result of ``score_scan_quality`` or None if scoring failed.
    """
    import time as _time

    t_total = _time.time()
    img_h, img_w = img_bgr.shape[:2]
    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")

    # 1. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
        logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
    else:
        logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")

    # 2. Create pipeline session in DB (visible in admin Kombi UI).
    # Best effort: a DB failure must not abort OCR.
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
        original_png = png_buf.tobytes()
        await create_pipeline_session_db(
            pipeline_session_id,
            name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
            filename=f"page_{page_number + 1}.png",
            original_png=original_png,
        )
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")

    # 3. Deskew (via deskew_two_pass)
    t0 = _time.time()
    deskewed_bgr, angle_applied, _ = deskew_two_pass(img_bgr.copy())
    logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")

    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f" dewarp: shear={dewarp_info.get('shear_degrees', 0):.3f} ({_time.time() - t0:.1f}s)")

    # 5. Content crop (removes scanner borders, gutter shadows)
    t0 = _time.time()
    try:
        from page_crop import detect_and_crop_page
        cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
        if crop_result.get("crop_applied"):
            dewarped_bgr = cropped_bgr
            logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
        else:
            logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
    except Exception as e:
        logger.warning(f" crop: failed ({e}), continuing with uncropped image")

    # 5b. Scan quality assessment
    scan_quality_report = None
    try:
        from scan_quality import score_scan_quality
        scan_quality_report = score_scan_quality(dewarped_bgr)
    except Exception as e:
        logger.warning(f" scan quality: failed ({e})")

    # A falsy override (None or 0) means "derive from the quality report".
    if override_min_conf:
        min_ocr_conf = override_min_conf
    else:
        min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40

    # 5c. Image enhancement for degraded scans
    is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
    if is_degraded and enable_enhance:
        try:
            from ocr_image_enhance import enhance_for_ocr
            dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
            logger.info(" enhancement: applied (degraded scan)")
        except Exception as e:
            logger.warning(f" enhancement: failed ({e})")

    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
    t0 = _time.time()
    img_h, img_w = dewarped_bgr.shape[:2]

    # RapidOCR (local ONNX); on any failure continue with Tesseract only.
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
    except Exception as e:
        logger.warning(f" RapidOCR failed: {e}")
        rapid_words = []

    # Tesseract
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu", config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i, raw_text in enumerate(data["text"]):
        text = str(raw_text).strip()
        # Confidence may arrive as int, float or (float-)string depending on
        # the Tesseract/pytesseract version; parse leniently.
        conf = _parse_tesseract_conf(data["conf"][i])
        if not text or conf < min_ocr_conf:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i],
            "conf": conf,
        })

    # Merge dual-engine results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    from cv_words_first import build_grid_from_words

    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words  # fallback to Tesseract only

    # Build initial grid from merged words
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"

    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
                f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")

    # 7. Save word_result to pipeline session (needed by _build_grid_core)
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": 0,
        "ocr_engine": "rapid_kombi",
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
        },
    }

    # Save images + word_result to pipeline session for admin visibility
    try:
        _, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        _, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        # dewarped_bgr already holds the cropped/enhanced image, so the same
        # encoded bytes serve both the dewarped and the cropped slot.
        dewarped_png_bytes = dwp_buf.tobytes()
        await update_pipeline_session_db(
            pipeline_session_id,
            deskewed_png=dsk_buf.tobytes(),
            dewarped_png=dewarped_png_bytes,
            cropped_png=dewarped_png_bytes,
            word_result=word_result,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
            current_step=8,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")

    # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
    t0 = _time.time()
    try:
        from grid_editor_api import _build_grid_core
        session_data = {
            "word_result": word_result,
        }
        grid_result = await _build_grid_core(
            pipeline_session_id, session_data,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
        )
        logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
                    f"({_time.time() - t0:.1f}s)")

        # Save grid result to pipeline session (best effort, but do not
        # hide the failure from the logs).
        try:
            await update_pipeline_session_db(
                pipeline_session_id,
                grid_editor_result=grid_result,
                current_step=11,
            )
        except Exception as e:
            logger.warning(f"Could not save grid result to pipeline session: {e}")

    except Exception as e:
        logger.warning(f" grid-build failed: {e}, falling back to basic grid")
        grid_result = None

    # 9. Extract vocab entries
    # Prefer grid-build result (better column detection, more cells) over
    # the initial build_grid_from_words() which often under-clusters.
    page_vocabulary = []
    extraction_source = "none"

    # A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
    if grid_result and grid_result.get("zones"):
        for zone in grid_result["zones"]:
            zone_cols = zone.get("columns", [])
            zone_cells = zone.get("cells", [])
            if not zone_cols or not zone_cells:
                continue

            # Sort columns by x position to determine roles
            sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))

            # Skip zones with only 1 column (likely headers/boxes)
            if len(sorted_cols) < 2:
                continue

            # Group cells by row
            rows_map: dict = {}
            for cell in zone_cells:
                ri = cell.get("row_index", 0)
                ci = cell.get("col_index", 0)
                rows_map.setdefault(ri, {})[ci] = (cell.get("text") or "").strip()

            # Heuristic: if the first column is very narrow (<15% of zone
            # width) and there are at least 3 columns, it is likely a
            # marker/page-ref column — skip it for vocab. This depends only
            # on the zone's columns, so compute it once per zone.
            zone_n_cols = len(sorted_cols)
            first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
            zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
            skip_first = first_col_width / zone_width < 0.15 and zone_n_cols >= 3

            for ri in sorted(rows_map.keys()):
                row = rows_map[ri]
                # Collect texts in column-position order
                texts = [
                    row.get(col.get("col_index", col.get("index", -1)), "")
                    for col in sorted_cols
                ]

                if not any(texts):
                    continue

                data_texts = texts[1:] if skip_first else texts

                entry = {
                    "id": str(uuid.uuid4()),
                    "english": data_texts[0] if len(data_texts) > 0 else "",
                    "german": data_texts[1] if len(data_texts) > 1 else "",
                    "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
                    "source_page": page_number + 1,
                }
                if entry["english"] or entry["german"]:
                    page_vocabulary.append(entry)

        if page_vocabulary:
            extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"

    # B) Fallback: original cells with column classification
    if not page_vocabulary:
        col_types = {c.get("type") for c in columns_meta}
        is_vocab = bool(col_types & {"column_en", "column_de"})

        if is_vocab:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation="british")
            for entry in entries:
                if not entry.get("english") and not entry.get("german"):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": entry.get("english", ""),
                    "german": entry.get("german", ""),
                    "example_sentence": entry.get("example", ""),
                    "source_page": page_number + 1,
                })
            extraction_source = f"classified ({len(columns_meta)} cols)"
        else:
            # Last resort: all cells by position
            rows_map2: dict = {}
            for cell in cells:
                ri = cell.get("row_index", 0)
                ci = cell.get("col_index", 0)
                rows_map2.setdefault(ri, {})[ci] = (cell.get("text") or "").strip()
            all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
            for ri in sorted(rows_map2.keys()):
                row = rows_map2[ri]
                texts = [row.get(ci, "") for ci in all_ci]
                if not any(texts):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": texts[0] if len(texts) > 0 else "",
                    "german": texts[1] if len(texts) > 1 else "",
                    "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
                    "source_page": page_number + 1,
                })
            extraction_source = f"generic ({len(all_ci)} cols)"

    # --- Post-processing: merge cell-wrap continuation rows ---
    if len(page_vocabulary) >= 2:
        try:
            # Convert to internal format (example_sentence → example)
            internal = [
                {
                    'row_index': idx,
                    'english': v.get('english', ''),
                    'german': v.get('german', ''),
                    'example': v.get('example_sentence', ''),
                }
                for idx, v in enumerate(page_vocabulary)
            ]

            n_before = len(internal)
            internal = _merge_wrapped_rows(internal)
            internal = _merge_phonetic_continuation_rows(internal)
            internal = _merge_continuation_rows(internal)

            # Only rebuild when merging actually reduced the row count;
            # rebuilt entries receive fresh ids.
            if len(internal) < n_before:
                merged_vocab = []
                for entry in internal:
                    if not entry.get('english') and not entry.get('german'):
                        continue
                    merged_vocab.append({
                        'id': str(uuid.uuid4()),
                        'english': entry.get('english', ''),
                        'german': entry.get('german', ''),
                        'example_sentence': entry.get('example', ''),
                        'source_page': page_number + 1,
                    })
                logger.info(f" row merging: {n_before} → {len(merged_vocab)} entries")
                page_vocabulary = merged_vocab
        except Exception as e:
            logger.warning(f" row merging failed (non-critical): {e}")

    logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")

    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")

    return page_vocabulary, rotation, scan_quality_report
|
|
||||||
|
|||||||
@@ -1,490 +1,4 @@
|
|||||||
"""
|
# Backward-compat shim -- module moved to vocab/worksheet/upload_api.py
|
||||||
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
|
import importlib as _importlib
|
||||||
|
import sys as _sys
|
||||||
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
|
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.upload_api")
|
||||||
|
|
||||||
Routes (no prefix — included into the main /api/v1/vocab router):
|
|
||||||
POST /sessions/{session_id}/upload-pdf-info
|
|
||||||
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
|
||||||
GET /sessions/{session_id}/pdf-page-image/{page_number}
|
|
||||||
POST /sessions/{session_id}/process-single-page/{page_number}
|
|
||||||
POST /sessions/{session_id}/process-pages
|
|
||||||
"""
|
|
||||||
|
|
||||||
import io
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import uuid
|
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
|
|
||||||
from fastapi.responses import StreamingResponse
|
|
||||||
|
|
||||||
from vocab_worksheet_models import SessionStatus, VocabularyEntry
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Local storage path
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Optional heavy dependencies
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
try:
|
|
||||||
import numpy as np
|
|
||||||
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
|
|
||||||
OCR_PIPELINE_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
np = None # type: ignore[assignment]
|
|
||||||
OCR_PIPELINE_AVAILABLE = False
|
|
||||||
logger.warning("OCR pipeline imports not available in upload module")
|
|
||||||
|
|
||||||
# Sub-module imports (already split out)
|
|
||||||
from vocab_worksheet_generation import (
|
|
||||||
convert_pdf_page_to_image,
|
|
||||||
convert_pdf_to_images,
|
|
||||||
get_pdf_page_count,
|
|
||||||
)
|
|
||||||
from vocab_worksheet_extraction import extract_vocabulary_from_image
|
|
||||||
|
|
||||||
try:
|
|
||||||
from vocab_worksheet_ocr import _run_ocr_pipeline_for_page
|
|
||||||
except ImportError:
|
|
||||||
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
|
|
||||||
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# In-memory session store (shared with main module)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _get_sessions():
    """Return the ``_sessions`` object defined in ``vocab_worksheet_api``.

    The module is imported inside the function rather than at module load.
    """
    import vocab_worksheet_api as _main_api

    return _main_api._sessions
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Router (no prefix — will be included into the main vocab router)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
upload_router = APIRouter()
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# POST /sessions/{session_id}/upload-pdf-info
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and return its page count and per-page rotations for preview.

    Call this before processing so the user can select pages. The PDF is
    written to ``<LOCAL_STORAGE_PATH>/<session_id>/source.pdf`` and its bytes,
    path, page count and detected page rotations are stored on the session.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if the upload is
            not a PDF (judged by file extension or content type).
    """
    logger.info(f"PDF info request for session {session_id}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    # Accept the upload if either the extension or the MIME type says PDF.
    has_pdf_extension = bool(file.filename) and file.filename.rsplit('.', 1)[-1].lower() == 'pdf'
    has_pdf_mime = (file.content_type or '') == 'application/pdf'
    if not (has_pdf_extension or has_pdf_mime):
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    pdf_bytes = await file.read()

    # Persist the PDF under the session's storage directory.
    storage_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(storage_dir, exist_ok=True)
    stored_pdf_path = os.path.join(storage_dir, "source.pdf")
    with open(stored_pdf_path, 'wb') as out_file:
        out_file.write(pdf_bytes)

    n_pages = get_pdf_page_count(pdf_bytes)

    # Keep the PDF in the session for later per-page processing.
    session["pdf_data"] = pdf_bytes
    session["pdf_path"] = stored_pdf_path
    session["pdf_page_count"] = n_pages
    session["status"] = "pdf_uploaded"

    # Detect orientation per page so thumbnails are shown the right way up.
    # Keys are 0-indexed page numbers; only rotated pages get an entry.
    rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for page_idx in range(n_pages):
            try:
                rendered = render_pdf_high_res(pdf_bytes, page_idx, zoom=2.0)
                _, detected = detect_and_fix_orientation(rendered)
                if detected:
                    rotations[page_idx] = detected
                    logger.info(f"Page {page_idx + 1}: orientation {detected}°")
            except Exception as e:
                logger.warning(f"Orientation detection failed for page {page_idx + 1}: {e}")
    session["page_rotations"] = rotations

    return {
        "session_id": session_id,
        "page_count": n_pages,
        "filename": file.filename,
        "page_rotations": rotations,
    }
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        session_id: ID of the session that holds the uploaded PDF.
        page_number: 0-indexed page number.
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).

    Returns:
        A StreamingResponse with the page rendered as PNG.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if no PDF was
            uploaded or the page number is out of range, 500 if rendering
            fails.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            # Validate against the real document so a bad page number is a
            # client error (400), not a rendering failure (500).
            page_count = pdf_document.page_count
            if page_number < 0 or page_number >= page_count:
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).",
                )
            page = pdf_document[page_number]
            # Apply orientation correction detected during OCR processing
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Always release the document, even when rendering raises.
            pdf_document.close()
    except HTTPException:
        # Preserve deliberate HTTP errors instead of rewrapping them as 500.
        raise
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# GET /sessions/{session_id}/pdf-page-image/{page_number}
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        session_id: ID of the session that holds the uploaded PDF.
        page_number: 0-indexed page number.
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).

    Returns:
        A StreamingResponse with the page rendered as PNG.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if no PDF was
            uploaded or the page number is out of range, 500 if rendering
            fails.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            # Apply orientation correction detected during OCR processing
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Always release the document, even when rendering raises.
            pdf_document.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# POST /sessions/{session_id}/process-single-page/{page_number}
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid-build with autocorrect/merge) for best quality.
    When the OCR pipeline is not available, falls back to LLM vision
    extraction of the rendered page image.

    Path params:
        session_id: Key of the session in the in-memory session store.
        page_number: 0-indexed page of the uploaded PDF. Note that the
            response (and ``source_page`` on each entry) is 1-indexed.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.

    Returns:
        A dict with the vocabulary for just this one page. Extraction
        failures are reported in-band with ``success=False`` and an
        ``error`` message instead of an HTTP error status. On success the
        dict also carries ``total_vocabulary_count``,
        ``extraction_confidence``, ``rotation`` and, when the pipeline
        produced a quality report, ``scan_quality``.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if the session
            has no PDF or ``page_number`` is out of range.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    def _failure(message):
        """Build the in-band failure payload shared by both extraction paths."""
        return {
            "session_id": session_id,
            "page_number": page_number + 1,
            "success": False,
            "error": message,
            "vocabulary": [],
            "vocabulary_count": 0,
        }

    # Derive pipeline-level variable names for the quality report.
    # A non-positive max_cols / min_conf means "no limit" / "no override".
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None

    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; overridden by pipeline when quality report is available
    # The pipeline call below returns no confidence value, so the fixed value
    # this endpoint has always reported is kept for that path. The LLM
    # fallback does return one and overrides it.
    extraction_confidence = 0.9

    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # Update min_ocr_conf from quality report if available
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            # Deliberately broad: any pipeline failure is reported in-band so
            # the frontend can continue with the remaining pages.
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return _failure(f"OCR pipeline error: {e}")
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return _failure(error)

        # Report the confidence the extractor actually measured instead of
        # discarding it (the multi-page endpoint reports it under the same key).
        if confidence is not None:
            extraction_confidence = confidence

        page_vocabulary = []
        for entry in vocabulary:
            # Entries may be pydantic-style models, plain objects or mappings.
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            if not entry_dict.get('id'):
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Store rotation for this page (used by image/thumbnail endpoints)
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Merge into the session's vocabulary: drop entries previously stored for
    # this page (re-processing), keep all other pages, then append the new ones.
    existing_vocab = [
        v for v in session.get("vocabulary", [])
        if v.get("source_page") != page_number + 1
    ]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        "extraction_confidence": extraction_confidence,
        "rotation": rotation_deg,
    }

    # Add scan quality report + active steps info
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq

    return result
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# POST /sessions/{session_id}/process-pages (DEPRECATED)
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: List[int] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Each selected page is converted to an image and passed to
    ``extract_vocabulary_from_image`` one after another. The session's
    vocabulary is REPLACED by the result of this call (it is not merged
    with earlier results), and the first converted image is written to
    ``source.png`` in the session's storage directory as a preview.

    Args:
        session_id: Key of the session in the in-memory session store.
        pages: List of 0-indexed page numbers to process. Defaults to the
            first page when omitted or empty.
        process_all: If True, process all pages (``pages`` is ignored).

    Returns:
        A summary dict with processed/successful/failed page counts, the
        1-indexed ``successful_pages`` / ``failed_pages`` lists, the total
        ``vocabulary_count``, the average ``extraction_confidence`` over
        successful pages (0 if none succeeded) and, if any page failed, an
        ``errors`` list.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if the session
            has no PDF or any requested page number is out of range.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif not pages:
        pages = [0]  # Default to first page

    # Reject out-of-range pages up front, with the same rule and message as
    # the single-page endpoint. Without this, an invalid index either fails
    # inside the converter or shifts the page/image pairing below.
    if any(p < 0 or p >= page_count for p in pages):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    # zip() pairs each image with its requested page and cannot raise an
    # IndexError should the converter return fewer images than requested.
    for page_num, image_data in zip(pages, images):
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
            continue

        successful_pages.append(page_num + 1)
        total_confidence += confidence

        # Add page info to each entry and convert to dict
        for entry in vocabulary:
            # Entries may be pydantic-style models, plain objects or mappings.
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_num + 1
            # Give every entry an id, as the single-page endpoint does.
            if not entry_dict.get('id'):
                entry_dict['id'] = str(uuid.uuid4())
            all_vocabulary.append(entry_dict)

        logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session (replaces any vocabulary stored by earlier calls)
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        # The directory is not created anywhere in this handler; make sure it
        # exists so the write cannot fail with FileNotFoundError.
        os.makedirs(session_dir, exist_ok=True)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result
|
|
||||||
|
|||||||
Reference in New Issue
Block a user