feat: ImageLayoutEditor, arrow-key nav, multi-select bold, wider columns
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s

- New ImageLayoutEditor: SVG overlay on original scan with draggable
  column dividers, horizontal guidelines (margins/header/footer),
  double-click to add columns, x-button to delete
- GridTable: MIN_COL_WIDTH 40→80px for better readability
- Arrow up/down keys navigate between rows in the grid editor
- Ctrl+Click for multi-cell selection, Ctrl+B to toggle bold on selection
- getAdjacentCell works for cells that don't exist yet (new rows/cols)
- deleteColumn now merges x-boundaries correctly
- Session restore fix: grid_editor_result/structure_result in session GET
- Footer row 3-state cycle, auto-create cells for empty footer rows
- Grid save/build/GT-mark now advance current_step=11

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 07:45:39 +01:00
parent 4e668660a7
commit 65f4ce1947
12 changed files with 1422 additions and 90 deletions

View File

@@ -2275,6 +2275,324 @@ def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
return {k: round(v, 3) for k, v in scores.items()}
# --- Dictionary / Wörterbuch Detection ---
# Article words that appear as a dedicated column in dictionaries:
# German articles plus the English articles / infinitive marker.
_DICT_ARTICLE_WORDS = set(
    "die der das dem den des ein eine einem einer "
    "the a an to".split()
)
def _score_dictionary_signals(
    geometries: List["ColumnGeometry"],
    document_category: Optional[str] = None,
    margin_strip_detected: bool = False,
) -> Dict[str, Any]:
    """Score dictionary-specific patterns across all columns.

    Combines 4 independent signals to decide if the page is a dictionary:

    1. Alphabetical ordering of words in each column      (weight 0.35)
    2. Article column detection (der/die/das, to)         (weight 0.25)
    3. First-letter uniformity (headwords share a letter) (weight 0.25)
    4. Decorative A-Z margin strip (detected upstream)    (weight 0.15)

    Args:
        geometries: List of ColumnGeometry with words.
        document_category: User-selected category (e.g. 'woerterbuch').
        margin_strip_detected: Whether a decorative A-Z margin strip was found.

    Returns:
        Dict with 'is_dictionary', 'confidence', 'article_col_index',
        'headword_col_index', and 'signals' sub-dict.
    """
    from collections import Counter  # hoisted: was imported inside a loop

    result: Dict[str, Any] = {
        "is_dictionary": False,
        "confidence": 0.0,
        "article_col_index": None,
        "headword_col_index": None,
        "signals": {},
    }
    # A dictionary layout needs at least two columns (this also covers the
    # empty-list case, so no separate truthiness check is needed).
    if len(geometries) < 2:
        return result

    # --- Signal 1: Alphabetical ordering per column (weight 0.35) ---
    best_alpha_score = 0.0
    best_alpha_col = -1
    for geom in geometries:
        texts = [
            w["text"].strip().lower()
            for w in sorted(geom.words, key=lambda w: w.get("top", 0))
            if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
        ]
        if len(texts) < 5:
            continue
        # Deduplicate consecutive identical words (OCR double-reads)
        deduped = [texts[0]]
        for t in texts[1:]:
            if t != deduped[-1]:
                deduped.append(t)
        if len(deduped) < 5:
            continue
        # Fraction of consecutive pairs that are in alphabetical order
        ordered_pairs = sum(
            1 for i in range(len(deduped) - 1)
            if deduped[i] <= deduped[i + 1]
        )
        alpha_score = ordered_pairs / (len(deduped) - 1)
        if alpha_score > best_alpha_score:
            best_alpha_score = alpha_score
            best_alpha_col = geom.index
    result["signals"]["alphabetical_score"] = round(best_alpha_score, 3)
    result["signals"]["alphabetical_col"] = best_alpha_col

    # --- Signal 2: Article detection (weight 0.25) ---
    # Check three patterns:
    #   (a) Dedicated narrow article column (der/die/das only)
    #   (b) Inline articles: multi-word texts starting with "der X", "die X"
    #   (c) High article word frequency: many individual words ARE articles
    #       (common when OCR splits "der Zustand" into separate word_boxes)
    best_article_density = 0.0
    best_article_col = -1
    best_inline_article_ratio = 0.0
    best_article_word_ratio = 0.0
    for geom in geometries:
        texts = [
            w["text"].strip().lower()
            for w in geom.words
            if w.get("conf", 0) > 30 and len(w["text"].strip()) > 0
        ]
        if len(texts) < 3:
            continue
        # (a) Dedicated article column: narrow, mostly article words
        article_count = sum(1 for t in texts if t in _DICT_ARTICLE_WORDS)
        if geom.width_ratio <= 0.20:
            density = article_count / len(texts)
            if density > best_article_density:
                best_article_density = density
                best_article_col = geom.index
        # (b) Inline articles: "der Zustand", "die Zutat", etc.
        inline_count = sum(
            1 for t in texts
            if any(t.startswith(art + " ") for art in _DICT_ARTICLE_WORDS)
        )
        inline_ratio = inline_count / len(texts)
        if inline_ratio > best_inline_article_ratio:
            best_inline_article_ratio = inline_ratio
        # (c) Article word frequency in any column (for OCR-split word_boxes).
        # Requires >= 3 article words AND >= 3 non-article words, so tiny
        # columns don't count and a pure article column stays in case (a).
        if article_count >= 3:
            art_ratio = article_count / len(texts)
            non_art = len(texts) - article_count
            if non_art >= 3 and art_ratio > best_article_word_ratio:
                best_article_word_ratio = art_ratio
    # Use the strongest of the three article signals
    effective_article_score = max(
        best_article_density,
        best_inline_article_ratio,
        best_article_word_ratio * 0.8,  # slight discount for raw word ratio
    )
    result["signals"]["article_density"] = round(best_article_density, 3)
    result["signals"]["inline_article_ratio"] = round(best_inline_article_ratio, 3)
    result["signals"]["article_word_ratio"] = round(best_article_word_ratio, 3)
    result["signals"]["article_col"] = best_article_col

    # --- Signal 3: First-letter uniformity (weight 0.25) ---
    best_uniformity = 0.0
    best_uniform_col = -1
    has_letter_transition = False
    for geom in geometries:
        texts = [
            w["text"].strip().lower()
            for w in sorted(geom.words, key=lambda w: w.get("top", 0))
            if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
        ]
        if len(texts) < 5:
            continue
        first_letters = [t[0] for t in texts if t[0].isalpha()]
        if not first_letters:
            continue
        letter_counts = Counter(first_letters)
        # Only the count is needed; the letter itself was unused.
        _, most_common_count = letter_counts.most_common(1)[0]
        uniformity = most_common_count / len(first_letters)
        # Check for orderly letter transitions (A→B or Y→Z): collapse
        # consecutive runs of equal first letters, then check that the run
        # letters appear in alphabetical order.
        groups = []
        current_letter = first_letters[0]
        for fl in first_letters:
            if fl != current_letter:
                groups.append(current_letter)
                current_letter = fl
        groups.append(current_letter)
        if 2 <= len(groups) <= 5:
            if all(groups[i] <= groups[i + 1] for i in range(len(groups) - 1)):
                has_letter_transition = True
                # Boost uniformity for orderly transitions
                uniformity = max(uniformity, 0.70)
        if uniformity > best_uniformity:
            best_uniformity = uniformity
            best_uniform_col = geom.index
    result["signals"]["first_letter_uniformity"] = round(best_uniformity, 3)
    result["signals"]["uniform_col"] = best_uniform_col
    result["signals"]["has_letter_transition"] = has_letter_transition

    # --- Signal 4: Decorative margin strip (weight 0.15) ---
    result["signals"]["margin_strip_detected"] = margin_strip_detected

    # --- Combine signals ---
    s1 = min(best_alpha_score, 1.0) * 0.35
    s2 = min(effective_article_score, 1.0) * 0.25
    s3 = min(best_uniformity, 1.0) * 0.25
    s4 = (1.0 if margin_strip_detected else 0.0) * 0.15
    combined = s1 + s2 + s3 + s4
    # Boost if user set document_category to 'woerterbuch'
    if document_category == "woerterbuch":
        combined = min(1.0, combined + 0.20)
        result["signals"]["category_boost"] = True
    result["confidence"] = round(combined, 3)

    # Threshold: combined >= 0.40 to classify as dictionary
    # (at least 2 strong signals or 3 moderate ones)
    if combined >= 0.40:
        result["is_dictionary"] = True
        # Identify headword column: best alphabetical OR best uniform
        if best_alpha_col >= 0 and best_alpha_score >= 0.60:
            result["headword_col_index"] = best_alpha_col
        elif best_uniform_col >= 0 and best_uniformity >= 0.50:
            result["headword_col_index"] = best_uniform_col
        if best_article_col >= 0 and best_article_density >= 0.30:
            result["article_col_index"] = best_article_col
        # If inline articles are strong but no dedicated column, note it
        if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None:
            result["signals"]["inline_articles_detected"] = True

    logger.info(
        "DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
        combined, result["is_dictionary"], result["signals"],
    )
    return result
def _classify_dictionary_columns(
    geometries: List["ColumnGeometry"],
    dict_signals: Dict[str, Any],
    lang_scores: List[Dict[str, float]],
    content_h: int,
) -> Optional[List["PageRegion"]]:
    """Classify columns for a detected dictionary page.

    Assigns column_headword, column_article, column_ipa, and
    column_de/column_en based on dictionary signals and language scores.

    Args:
        geometries: Column geometries for the page.
        dict_signals: Result dict from _score_dictionary_signals().
        lang_scores: Per-column language score dicts (keys 'eng'/'deu'),
            indexed by column index.
        content_h: Content-area height in pixels; every column region
            spans this full height.

    Returns:
        Regions sorted left-to-right, or None when dict_signals did not
        flag the page as a dictionary.
    """
    if not dict_signals.get("is_dictionary"):
        return None
    regions: List[PageRegion] = []
    assigned = set()
    article_idx = dict_signals.get("article_col_index")
    headword_idx = dict_signals.get("headword_col_index")

    # 1. Assign article column if detected
    if article_idx is not None:
        for geom in geometries:
            if geom.index == article_idx:
                regions.append(PageRegion(
                    type="column_article",
                    x=geom.x, y=geom.y,
                    width=geom.width, height=content_h,
                    classification_confidence=round(
                        dict_signals["signals"].get("article_density", 0.5), 2),
                    classification_method="dictionary",
                ))
                assigned.add(geom.index)
                break

    # 2. Assign headword column
    if headword_idx is not None and headword_idx not in assigned:
        for geom in geometries:
            if geom.index == headword_idx:
                regions.append(PageRegion(
                    type="column_headword",
                    x=geom.x, y=geom.y,
                    width=geom.width, height=content_h,
                    classification_confidence=round(
                        dict_signals["confidence"], 2),
                    classification_method="dictionary",
                ))
                assigned.add(geom.index)
                break

    # 3. Assign remaining columns by language + content
    remaining = [g for g in geometries if g.index not in assigned]
    for geom in remaining:
        ls = lang_scores[geom.index] if geom.index < len(lang_scores) else {"eng": 0, "deu": 0}
        # IPA columns contain transcription markers (brackets, slashes,
        # stress marks, IPA vowels).
        ipa_chars = sum(
            1 for w in geom.words
            if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ")
        )
        ipa_ratio = ipa_chars / max(len(geom.words), 1)
        if ipa_ratio > 0.25:
            col_type = "column_ipa"
            conf = round(min(1.0, ipa_ratio), 2)
        elif ls["deu"] > ls["eng"] and ls["deu"] > 0.05:
            col_type = "column_de"
            conf = round(ls["deu"], 2)
        elif ls["eng"] > ls["deu"] and ls["eng"] > 0.05:
            col_type = "column_en"
            conf = round(ls["eng"], 2)
        else:
            # Positional fallback: leftmost unassigned = EN, next = DE.
            # (Fixed: the original used an unparenthesized nested
            # conditional expression "if geom == x[0] if x else None",
            # which was hard to read; the effective behavior is kept.)
            left_unassigned = sorted(
                (g for g in remaining if g.index not in assigned),
                key=lambda g: g.x,
            )
            if left_unassigned and geom is left_unassigned[0]:
                col_type = "column_en"
            else:
                col_type = "column_de"
            conf = 0.4
        regions.append(PageRegion(
            type=col_type,
            x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=conf,
            classification_method="dictionary",
        ))
        assigned.add(geom.index)
    regions.sort(key=lambda r: r.x)
    return regions
def _build_margin_regions(
all_regions: List[PageRegion],
left_x: int,
@@ -2418,9 +2736,12 @@ def classify_column_types(geometries: List[ColumnGeometry],
bottom_y: int,
left_x: int = 0,
right_x: int = 0,
inv: Optional[np.ndarray] = None) -> List[PageRegion]:
inv: Optional[np.ndarray] = None,
document_category: Optional[str] = None,
margin_strip_detected: bool = False) -> List[PageRegion]:
"""Classify column types using a 3-level fallback chain.
Level 0: Dictionary detection (if signals are strong enough)
Level 1: Content-based (language + role scoring)
Level 2: Position + language (old rules enhanced with language detection)
Level 3: Pure position (exact old code, no regression)
@@ -2434,6 +2755,8 @@ def classify_column_types(geometries: List[ColumnGeometry],
bottom_y: Bottom Y of content area.
left_x: Left content bound (from _find_content_bounds).
right_x: Right content bound (from _find_content_bounds).
document_category: User-selected category (e.g. 'woerterbuch').
margin_strip_detected: Whether a decorative A-Z margin strip was found.
Returns:
List of PageRegion with types, confidence, and method.
@@ -2499,6 +2822,22 @@ def classify_column_types(geometries: List[ColumnGeometry],
logger.info(f"ClassifyColumns: role scores: "
f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")
# --- Level 0: Dictionary detection ---
dict_signals = _score_dictionary_signals(
geometries,
document_category=document_category,
margin_strip_detected=margin_strip_detected,
)
if dict_signals["is_dictionary"]:
regions = _classify_dictionary_columns(
geometries, dict_signals, lang_scores, content_h,
)
if regions is not None:
logger.info("ClassifyColumns: Level 0 (dictionary) succeeded, confidence=%.3f",
dict_signals["confidence"])
_add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
return _with_margins(ignore_regions + regions)
# --- Level 1: Content-based classification ---
regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
if regions is not None:

View File

@@ -85,7 +85,7 @@ ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'o
@dataclass
class PageRegion:
"""A detected region on the page."""
type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'
type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom', 'column_headword', 'column_article', 'column_ipa'
x: int
y: int
width: int

View File

@@ -1201,7 +1201,7 @@ def _filter_decorative_margin(
img_w: int,
log: Any,
session_id: str,
) -> None:
) -> Dict[str, Any]:
"""Remove words that belong to a decorative alphabet strip on a margin.
Some vocabulary worksheets have a vertical AZ alphabet graphic along
@@ -1220,9 +1220,13 @@ def _filter_decorative_margin(
artifacts like "Vv" that belong to the same decorative element.
Modifies *words* in place.
Returns:
Dict with 'found' (bool), 'side' (str), 'letters_detected' (int).
"""
no_strip: Dict[str, Any] = {"found": False, "side": "", "letters_detected": 0}
if not words or img_w <= 0:
return
return no_strip
margin_cutoff = img_w * 0.30
# Phase 1: find candidate strips using single-char words
@@ -1278,6 +1282,9 @@ def _filter_decorative_margin(
"(strip x=%d-%d)",
session_id, removed, side, strip_x_lo, strip_x_hi,
)
return {"found": True, "side": side, "letters_detected": len(strip)}
return no_strip
def _filter_footer_words(
@@ -1427,7 +1434,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# Some worksheets have a decorative alphabet strip along one margin
# (A-Z in a graphic). OCR reads these as single-char words aligned
# vertically. Detect and remove them before grid building.
_filter_decorative_margin(all_words, img_w, logger, session_id)
margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
margin_strip_detected = margin_strip_info.get("found", False)
# Read document_category from session (user-selected or auto-detected)
document_category = session.get("document_category")
# 2c. Filter footer rows (page numbers at the very bottom).
# Isolated short text in the bottom 5% of the page is typically a
@@ -1997,18 +2008,21 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
removed_pipes, z.get("zone_index", 0),
)
# Also strip pipe chars from word_box text and cell text that may remain
# from OCR reading syllable-separation marks (e.g. "zu|trau|en" → "zutrauen").
# Strip pipe chars ONLY from word_boxes/cells where the pipe is an
# OCR column-divider artifact. Preserve pipes that are embedded in
# words as syllable separators (e.g. "zu|trau|en") — these are
# intentional and used in dictionary Ground Truth.
for z in zones_data:
for cell in z.get("cells", []):
for wb in cell.get("word_boxes", []):
wbt = wb.get("text", "")
if "|" in wbt:
wb["text"] = wbt.replace("|", "")
# Only strip if the ENTIRE word_box is just pipe(s)
# (handled by _PIPE_RE above) — leave embedded pipes alone
text = cell.get("text", "")
if "|" in text:
cleaned = text.replace("|", "").strip()
if cleaned != text:
# Only strip leading/trailing pipes (OCR artifacts at cell edges)
cleaned = text.strip("|").strip()
if cleaned != text.strip():
cell["text"] = cleaned
# 4e. Detect and remove page-border decoration strips.
@@ -2668,6 +2682,63 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
)
font_size_suggestion = max(10, int(avg_row_height * 0.6))
# --- Dictionary detection on assembled grid ---
# Build lightweight ColumnGeometry-like structures from zone columns for
# dictionary signal scoring.
from cv_layout import _score_dictionary_signals
dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
try:
from cv_vocab_types import ColumnGeometry
for z in zones_data:
zone_cells = z.get("cells", [])
zone_cols = z.get("columns", [])
if len(zone_cols) < 2 or len(zone_cells) < 10:
continue
# Build pseudo-ColumnGeometry per column
pseudo_geoms = []
for col in zone_cols:
ci = col["index"]
col_cells = [c for c in zone_cells if c.get("col_index") == ci]
# Flatten word_boxes into word dicts compatible with _score_language
col_words = []
for cell in col_cells:
for wb in cell.get("word_boxes") or []:
col_words.append({
"text": wb.get("text", ""),
"conf": wb.get("conf", 0),
"top": wb.get("top", 0),
"left": wb.get("left", 0),
"height": wb.get("height", 0),
"width": wb.get("width", 0),
})
# Fallback: use cell text if no word_boxes
if not cell.get("word_boxes") and cell.get("text"):
col_words.append({
"text": cell["text"],
"conf": cell.get("confidence", 50),
"top": cell.get("bbox_px", {}).get("y", 0),
"left": cell.get("bbox_px", {}).get("x", 0),
"height": cell.get("bbox_px", {}).get("h", 20),
"width": cell.get("bbox_px", {}).get("w", 50),
})
col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
pseudo_geoms.append(ColumnGeometry(
index=ci, x=col.get("x_min_px", 0), y=0,
width=max(col_w, 1), height=img_h,
word_count=len(col_words), words=col_words,
width_ratio=col_w / max(img_w, 1),
))
if len(pseudo_geoms) >= 2:
dd = _score_dictionary_signals(
pseudo_geoms,
document_category=document_category,
margin_strip_detected=margin_strip_detected,
)
if dd["confidence"] > dict_detection["confidence"]:
dict_detection = dd
except Exception as e:
logger.warning("Dictionary detection failed: %s", e)
result = {
"session_id": session_id,
"image_width": img_w,
@@ -2693,6 +2764,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"avg_row_height_px": round(avg_row_height, 1),
"font_size_suggestion_px": font_size_suggestion,
},
"dictionary_detection": {
"is_dictionary": dict_detection.get("is_dictionary", False),
"confidence": dict_detection.get("confidence", 0.0),
"signals": dict_detection.get("signals", {}),
"article_col_index": dict_detection.get("article_col_index"),
"headword_col_index": dict_detection.get("headword_col_index"),
},
"duration_seconds": round(duration, 2),
}
@@ -2722,8 +2800,8 @@ async def build_grid(session_id: str):
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
# Persist to DB
await update_session_db(session_id, grid_editor_result=result)
# Persist to DB and advance current_step to 11 (reconstruction complete)
await update_session_db(session_id, grid_editor_result=result, current_step=11)
logger.info(
"build-grid session %s: %d zones, %d cols, %d rows, %d cells, "
@@ -2772,7 +2850,7 @@ async def save_grid(session_id: str, request: Request):
"edited": True,
}
await update_session_db(session_id, grid_editor_result=result)
await update_session_db(session_id, grid_editor_result=result, current_step=11)
logger.info("save-grid session %s: %d zones saved", session_id, len(body["zones"]))

View File

@@ -256,7 +256,7 @@ async def mark_ground_truth(
# Merge into existing ground_truth JSONB
gt = session.get("ground_truth") or {}
gt["build_grid_reference"] = reference
await update_session_db(session_id, ground_truth=gt)
await update_session_db(session_id, ground_truth=gt, current_step=11)
logger.info(
"Ground truth marked for session %s: %d cells",

View File

@@ -178,6 +178,18 @@ async def get_session_info(session_id: str):
result["word_result"] = session["word_result"]
if session.get("doc_type_result"):
result["doc_type_result"] = session["doc_type_result"]
if session.get("structure_result"):
result["structure_result"] = session["structure_result"]
if session.get("grid_editor_result"):
# Include summary only to keep response small
gr = session["grid_editor_result"]
result["grid_editor_result"] = {
"summary": gr.get("summary", {}),
"zones_count": len(gr.get("zones", [])),
"edited": gr.get("edited", False),
}
if session.get("ground_truth"):
result["ground_truth"] = session["ground_truth"]
# Sub-session info
if session.get("parent_session_id"):