feat: ImageLayoutEditor, arrow-key nav, multi-select bold, wider columns

- New ImageLayoutEditor: SVG overlay on original scan with draggable column dividers, horizontal guidelines (margins/header/footer), double-click to add columns, x-button to delete
- GridTable: MIN_COL_WIDTH 40→80px for better readability
- Arrow up/down keys navigate between rows in the grid editor
- Ctrl+Click for multi-cell selection, Ctrl+B to toggle bold on selection
- getAdjacentCell works for cells that don't exist yet (new rows/cols)
- deleteColumn now merges x-boundaries correctly
- Session restore fix: grid_editor_result/structure_result in session GET
- Footer row 3-state cycle, auto-create cells for empty footer rows
- Grid save/build/GT-mark now advance current_step=11

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 07:45:39 +01:00
parent 4e668660a7
commit 65f4ce1947
12 changed files with 1422 additions and 90 deletions
--- a/klausur-service/backend/cv_layout.py
+++ b/klausur-service/backend/cv_layout.py
@@ -2275,6 +2275,324 @@ def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
    return {k: round(v, 3) for k, v in scores.items()}


+# --- Dictionary / Wörterbuch Detection ---
+
+# Article words that appear as a dedicated column in dictionaries.
+# Entries are lowercase: the dictionary scoring code lowercases each OCR word
+# before testing membership in this set.
+_DICT_ARTICLE_WORDS = {
+    # German articles (definite and indefinite forms)
+    "die", "der", "das", "dem", "den", "des", "ein", "eine", "einem", "einer",
+    # English articles, plus the infinitive marker "to"
+    "the", "a", "an", "to",
+}
+
+
+def _dict_word_texts(
+    geom: ColumnGeometry,
+    min_len: int,
+    by_reading_order: bool = False,
+) -> List[str]:
+    """Return the lowercased, stripped texts of a column's confident words.
+
+    A word is kept when its ``conf`` exceeds 30 and its stripped text has at
+    least ``min_len`` characters.  A missing or ``None`` ``text`` is treated
+    as an empty string instead of raising.
+
+    Args:
+        geom: Column whose ``words`` (dicts read via ``text``/``conf``/``top``)
+            are inspected.
+        min_len: Minimum length of the stripped text.
+        by_reading_order: Sort the words by their ``top`` value first.
+
+    Returns:
+        The surviving texts, in ``geom.words`` order or sorted by ``top``.
+    """
+    words = geom.words
+    if by_reading_order:
+        words = sorted(words, key=lambda w: w.get("top", 0))
+    texts: List[str] = []
+    for word in words:
+        if word.get("conf", 0) <= 30:
+            continue
+        stripped = (word.get("text") or "").strip()
+        # Length is checked before lowercasing, which may change the length
+        # of some Unicode characters.
+        if len(stripped) >= min_len:
+            texts.append(stripped.lower())
+    return texts
+
+
+def _score_dictionary_signals(
+    geometries: List[ColumnGeometry],
+    document_category: Optional[str] = None,
+    margin_strip_detected: bool = False,
+) -> Dict[str, Any]:
+    """Score dictionary-specific patterns across all columns.
+
+    Combines 4 signals into one confidence value:
+      1. Alphabetical ordering of words in each column (weight 0.35)
+      2. Article detection: dedicated column, inline, or word ratio (0.25)
+      3. First-letter uniformity, boosted by orderly transitions (0.25)
+      4. Decorative A-Z margin strip, detected upstream (0.15)
+
+    A ``document_category`` of ``'woerterbuch'`` adds 0.20 (capped at 1.0).
+    The page is classified as a dictionary when the combined value reaches
+    0.40.
+
+    Args:
+        geometries: List of ColumnGeometry with words.
+        document_category: User-selected category (e.g. 'woerterbuch').
+        margin_strip_detected: Whether a decorative A-Z margin strip was found.
+
+    Returns:
+        Dict with 'is_dictionary', 'confidence', 'article_col_index',
+        'headword_col_index', and 'signals' sub-dict.  With fewer than two
+        columns the all-negative default is returned without logging.
+    """
+    # Function-scope imports: the module's import block is not visible here.
+    from collections import Counter
+    from itertools import groupby
+
+    result: Dict[str, Any] = {
+        "is_dictionary": False,
+        "confidence": 0.0,
+        "article_col_index": None,
+        "headword_col_index": None,
+        "signals": {},
+    }
+
+    if not geometries or len(geometries) < 2:
+        return result
+
+    # Signals 1 and 3 read the same top-to-bottom word list per column,
+    # so build it once.
+    reading_texts = [
+        (geom.index, _dict_word_texts(geom, min_len=2, by_reading_order=True))
+        for geom in geometries
+    ]
+
+    # --- Signal 1: Alphabetical ordering per column (weight 0.35) ---
+    # NOTE: strings are compared by code point, so umlauts sort after 'z'.
+    best_alpha_score = 0.0
+    best_alpha_col = -1
+    for col_index, texts in reading_texts:
+        if len(texts) < 5:
+            continue
+        # Collapse consecutive identical words (OCR double-reads)
+        deduped = [text for text, _ in groupby(texts)]
+        if len(deduped) < 5:
+            continue
+        # Share of adjacent pairs that are in non-decreasing order
+        ordered_pairs = sum(
+            1 for prev, nxt in zip(deduped, deduped[1:]) if prev <= nxt
+        )
+        alpha_score = ordered_pairs / (len(deduped) - 1)
+        if alpha_score > best_alpha_score:
+            best_alpha_score = alpha_score
+            best_alpha_col = col_index
+
+    result["signals"]["alphabetical_score"] = round(best_alpha_score, 3)
+    result["signals"]["alphabetical_col"] = best_alpha_col
+
+    # --- Signal 2: Article detection (weight 0.25) ---
+    # Check three patterns:
+    # (a) Dedicated narrow article column (der/die/das only)
+    # (b) Inline articles: multi-word texts starting with "der X", "die X"
+    # (c) High article word frequency: many individual words ARE articles
+    #     (common when OCR splits "der Zustand" into separate word_boxes)
+    best_article_density = 0.0
+    best_article_col = -1
+    best_inline_article_ratio = 0.0
+    best_article_word_ratio = 0.0
+    # str.startswith accepts a tuple, so build the "article + space" prefixes once
+    article_prefixes = tuple(art + " " for art in _DICT_ARTICLE_WORDS)
+
+    for geom in geometries:
+        texts = _dict_word_texts(geom, min_len=1)
+        if len(texts) < 3:
+            continue
+
+        article_count = sum(1 for t in texts if t in _DICT_ARTICLE_WORDS)
+        art_ratio = article_count / len(texts)
+
+        # (a) Dedicated article column: narrow, mostly article words
+        if geom.width_ratio <= 0.20 and art_ratio > best_article_density:
+            best_article_density = art_ratio
+            best_article_col = geom.index
+
+        # (b) Inline articles: "der Zustand", "die Zutat", etc.
+        inline_count = sum(1 for t in texts if t.startswith(article_prefixes))
+        inline_ratio = inline_count / len(texts)
+        if inline_ratio > best_inline_article_ratio:
+            best_inline_article_ratio = inline_ratio
+
+        # (c) Article word frequency in any column (for OCR-split word_boxes).
+        # Requires >= 3 article words AND >= 3 non-article words (a pure
+        # article column is handled by (a)).  No minimum ratio is enforced;
+        # a low ratio simply contributes little to the score.
+        non_art = len(texts) - article_count
+        if article_count >= 3 and non_art >= 3 and art_ratio > best_article_word_ratio:
+            best_article_word_ratio = art_ratio
+
+    # Use the strongest signal
+    effective_article_score = max(
+        best_article_density,
+        best_inline_article_ratio,
+        best_article_word_ratio * 0.8,  # slight discount for raw word ratio
+    )
+
+    result["signals"]["article_density"] = round(best_article_density, 3)
+    result["signals"]["inline_article_ratio"] = round(best_inline_article_ratio, 3)
+    result["signals"]["article_word_ratio"] = round(best_article_word_ratio, 3)
+    result["signals"]["article_col"] = best_article_col
+
+    # --- Signal 3: First-letter uniformity (weight 0.25) ---
+    best_uniformity = 0.0
+    best_uniform_col = -1
+    has_letter_transition = False
+    for col_index, texts in reading_texts:
+        if len(texts) < 5:
+            continue
+        first_letters = [t[0] for t in texts if t[0].isalpha()]
+        if not first_letters:
+            continue
+        # Share of words that start with the most frequent first letter
+        top_count = Counter(first_letters).most_common(1)[0][1]
+        uniformity = top_count / len(first_letters)
+
+        # Orderly letter transitions (A→B or Y→Z): collapse runs of the same
+        # first letter and check that the run sequence is ascending.
+        groups = [letter for letter, _ in groupby(first_letters)]
+        if 2 <= len(groups) <= 5 and groups == sorted(groups):
+            has_letter_transition = True
+            # Boost uniformity for orderly transitions
+            uniformity = max(uniformity, 0.70)
+
+        if uniformity > best_uniformity:
+            best_uniformity = uniformity
+            best_uniform_col = col_index
+
+    result["signals"]["first_letter_uniformity"] = round(best_uniformity, 3)
+    result["signals"]["uniform_col"] = best_uniform_col
+    result["signals"]["has_letter_transition"] = has_letter_transition
+
+    # --- Signal 4: Decorative margin strip (weight 0.15) ---
+    result["signals"]["margin_strip_detected"] = margin_strip_detected
+
+    # --- Combine signals ---
+    s1 = min(best_alpha_score, 1.0) * 0.35
+    s2 = min(effective_article_score, 1.0) * 0.25
+    s3 = min(best_uniformity, 1.0) * 0.25
+    s4 = (1.0 if margin_strip_detected else 0.0) * 0.15
+
+    combined = s1 + s2 + s3 + s4
+
+    # Boost if user set document_category to 'woerterbuch'
+    if document_category == "woerterbuch":
+        combined = min(1.0, combined + 0.20)
+        result["signals"]["category_boost"] = True
+
+    result["confidence"] = round(combined, 3)
+
+    # Threshold: combined >= 0.40 to classify as dictionary
+    # (at least 2 strong signals or 3 moderate ones)
+    if combined >= 0.40:
+        result["is_dictionary"] = True
+        # Identify headword column: best alphabetical OR best uniform
+        if best_alpha_col >= 0 and best_alpha_score >= 0.60:
+            result["headword_col_index"] = best_alpha_col
+        elif best_uniform_col >= 0 and best_uniformity >= 0.50:
+            result["headword_col_index"] = best_uniform_col
+        if best_article_col >= 0 and best_article_density >= 0.30:
+            result["article_col_index"] = best_article_col
+        # If inline articles are strong but no dedicated column, note it
+        if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None:
+            result["signals"]["inline_articles_detected"] = True
+
+    logger.info(
+        "DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
+        combined, result["is_dictionary"], result["signals"],
+    )
+
+    return result
+
+
+def _classify_dictionary_columns(
+    geometries: List[ColumnGeometry],
+    dict_signals: Dict[str, Any],
+    lang_scores: List[Dict[str, float]],
+    content_h: int,
+) -> Optional[List[PageRegion]]:
+    """Classify columns for a detected dictionary page.
+
+    Assigns column_article and column_headword from the dictionary signals,
+    then labels every other column as column_ipa, column_de or column_en
+    based on IPA markers, language scores, or position.
+
+    Args:
+        geometries: Columns to classify.
+        dict_signals: Result dict of the dictionary signal scoring
+            ('is_dictionary', 'confidence', 'article_col_index',
+            'headword_col_index', 'signals').
+        lang_scores: Per-column language scores with 'eng'/'deu' entries,
+            looked up by ``geom.index`` (presumably aligned with
+            ``geometries`` — verify against the caller).
+        content_h: Height given to every produced region.
+
+    Returns:
+        Regions sorted by x, or None when ``dict_signals`` does not mark the
+        page as a dictionary.
+    """
+    if not dict_signals.get("is_dictionary"):
+        return None
+
+    signals = dict_signals.get("signals") or {}
+    regions: List[PageRegion] = []
+    assigned = set()
+
+    def _find(col_index: Optional[int]) -> Optional[ColumnGeometry]:
+        # First geometry carrying this index, or None if absent.
+        if col_index is None:
+            return None
+        return next((g for g in geometries if g.index == col_index), None)
+
+    def _region(geom: ColumnGeometry, col_type: str, confidence: float) -> PageRegion:
+        # All dictionary regions share geometry handling and method tag.
+        return PageRegion(
+            type=col_type,
+            x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=confidence,
+            classification_method="dictionary",
+        )
+
+    # 1. Assign article column if detected
+    article_geom = _find(dict_signals.get("article_col_index"))
+    if article_geom is not None:
+        regions.append(_region(
+            article_geom, "column_article",
+            round(signals.get("article_density", 0.5), 2),
+        ))
+        assigned.add(article_geom.index)
+
+    # 2. Assign headword column (skipped if it is the article column)
+    headword_idx = dict_signals.get("headword_col_index")
+    if headword_idx not in assigned:
+        headword_geom = _find(headword_idx)
+        if headword_geom is not None:
+            regions.append(_region(
+                headword_geom, "column_headword",
+                round(dict_signals.get("confidence", 0.0), 2),
+            ))
+            assigned.add(headword_geom.index)
+
+    # 3. Assign remaining columns by language + content.
+    # Visit them left to right so the positional fallback is deterministic;
+    # the final sort by x makes the visiting order invisible to callers.
+    ipa_markers = "[]/ˈˌːɪəɒʊæɑɔ"
+    remaining = sorted(
+        (g for g in geometries if g.index not in assigned),
+        key=lambda g: g.x,
+    )
+    fallback_en_taken = False
+    for geom in remaining:
+        if 0 <= geom.index < len(lang_scores):
+            ls = lang_scores[geom.index]
+        else:
+            ls = {}
+        deu = ls.get("deu", 0)
+        eng = ls.get("eng", 0)
+
+        # Share of words containing at least one IPA marker ([, /, ˈ, ...)
+        ipa_word_count = sum(
+            1 for w in geom.words
+            if any(c in (w.get("text") or "") for c in ipa_markers)
+        )
+        ipa_ratio = ipa_word_count / max(len(geom.words), 1)
+
+        if ipa_ratio > 0.25:
+            col_type = "column_ipa"
+            conf = round(min(1.0, ipa_ratio), 2)
+        elif deu > eng and deu > 0.05:
+            col_type = "column_de"
+            conf = round(deu, 2)
+        elif eng > deu and eng > 0.05:
+            col_type = "column_en"
+            conf = round(eng, 2)
+        else:
+            # Positional fallback: the leftmost column that needs it becomes
+            # EN, every later one DE.  Tracking this with a flag avoids
+            # re-deriving "leftmost unassigned" while `assigned` grows, which
+            # would label every fallback column EN.
+            col_type = "column_de" if fallback_en_taken else "column_en"
+            fallback_en_taken = True
+            conf = 0.4
+
+        regions.append(_region(geom, col_type, conf))
+        assigned.add(geom.index)
+
+    regions.sort(key=lambda r: r.x)
+    return regions
+
+
 def _build_margin_regions(
    all_regions: List[PageRegion],
    left_x: int,
@@ -2418,9 +2736,12 @@ def classify_column_types(geometries: List[ColumnGeometry],
                          bottom_y: int,
                          left_x: int = 0,
                          right_x: int = 0,
-                          inv: Optional[np.ndarray] = None) -> List[PageRegion]:
+                          inv: Optional[np.ndarray] = None,
+                          document_category: Optional[str] = None,
+                          margin_strip_detected: bool = False) -> List[PageRegion]:
    """Classify column types using a 3-level fallback chain.

+    Level 0: Dictionary detection (if signals are strong enough)
    Level 1: Content-based (language + role scoring)
    Level 2: Position + language (old rules enhanced with language detection)
    Level 3: Pure position (exact old code, no regression)
@@ -2434,6 +2755,8 @@ def classify_column_types(geometries: List[ColumnGeometry],
        bottom_y: Bottom Y of content area.
        left_x: Left content bound (from _find_content_bounds).
        right_x: Right content bound (from _find_content_bounds).
+        document_category: User-selected category (e.g. 'woerterbuch').
+        margin_strip_detected: Whether a decorative A-Z margin strip was found.

    Returns:
        List of PageRegion with types, confidence, and method.
@@ -2499,6 +2822,22 @@ def classify_column_types(geometries: List[ColumnGeometry],
    logger.info(f"ClassifyColumns: role scores: "
                f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")

+    # --- Level 0: Dictionary detection ---
+    dict_signals = _score_dictionary_signals(
+        geometries,
+        document_category=document_category,
+        margin_strip_detected=margin_strip_detected,
+    )
+    if dict_signals["is_dictionary"]:
+        regions = _classify_dictionary_columns(
+            geometries, dict_signals, lang_scores, content_h,
+        )
+        if regions is not None:
+            logger.info("ClassifyColumns: Level 0 (dictionary) succeeded, confidence=%.3f",
+                        dict_signals["confidence"])
+            _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
+            return _with_margins(ignore_regions + regions)
+
    # --- Level 1: Content-based classification ---
    regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
    if regions is not None: