feat(ocr-pipeline): add page-split endpoint for double-page book spreads

Each page of a double-page scan tilts differently due to the book spine. The new POST /page-split endpoint detects spreads after orientation and creates sub-sessions that go through the full pipeline (deskew, dewarp, crop, etc.) individually, so each page gets its own deskew correction. Also fixes border-strip filter incorrectly removing German translation words by adding a decorative-strip validation check. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 10:53:06 +01:00
parent 2a21127f01
commit 40815dafd1
2 changed files with 209 additions and 3 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -83,11 +83,25 @@ def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
            right_count = total - gi
            break

+    # Validate candidate strip: real border decorations consist mostly of
+    # very short tokens (<= 2 characters: alphabet letters, stray marks).
+    # Multi-word content like "der Ranzen" or "die Schals" (continuation of
+    # German translations) must NOT be removed.
+    def _is_decorative_strip(candidates: List[Dict]) -> bool:
+        if not candidates:
+            return False
+        short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
+        return short / len(candidates) >= 0.5
+
    strip_ids: set = set()
    if left_count > 0 and left_count / total < 0.20:
-        strip_ids = {id(w) for w in sorted_words[:left_count]}
+        candidates = sorted_words[:left_count]
+        if _is_decorative_strip(candidates):
+            strip_ids = {id(w) for w in candidates}
    elif right_count > 0 and right_count / total < 0.20:
-        strip_ids = {id(w) for w in sorted_words[total - right_count :]}
+        candidates = sorted_words[total - right_count:]
+        if _is_decorative_strip(candidates):
+            strip_ids = {id(w) for w in candidates}

    if not strip_ids:
        return words, 0