diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 6facc19..241d4c1 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -855,13 +855,25 @@ def _detect_heading_rows_by_single_cell( continue last_col = col_indices[-1] - # Count content cells per row (column_* but not column_1/page_ref) + # Count content cells per row (column_* but not column_1/page_ref). + # Exception: column_1 cells that contain a dictionary article word + # (die/der/das etc.) ARE content — they appear in dictionary layouts + # where the leftmost column holds grammatical articles. + _ARTICLE_WORDS = { + "die", "der", "das", "dem", "den", "des", "ein", "eine", + "the", "a", "an", + } row_content_counts: Dict[int, int] = {} for cell in cells: ct = cell.get("col_type", "") - if ct.startswith("column_") and ct != "column_1": - ri = cell.get("row_index", -1) - row_content_counts[ri] = row_content_counts.get(ri, 0) + 1 + if not ct.startswith("column_"): + continue + if ct == "column_1": + ctext = (cell.get("text") or "").strip().lower() + if ctext not in _ARTICLE_WORDS: + continue + ri = cell.get("row_index", -1) + row_content_counts[ri] = row_content_counts.get(ri, 0) + 1 # Majority of rows must have ≥2 content cells multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2) @@ -887,7 +899,8 @@ def _detect_heading_rows_by_single_cell( content_cells = [ c for c in row_cells if c.get("col_type", "").startswith("column_") - and c.get("col_type") != "column_1" + and (c.get("col_type") != "column_1" + or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS) ] if len(content_cells) != 1: continue @@ -2483,6 +2496,55 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if (c.get("word_boxes") or c.get("text", "").strip())] logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed) + # 5j-pre. Remove cells whose text is entirely garbled / artifact noise. + # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr", + # "\\", "PEE", "a=") that survive earlier filters because their rows also + # contain real content in other columns. Remove them here. + _COMMON_SHORT_WORDS = { + # German + "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja", + "ob", "so", "um", "zu", "wo", "je", "oh", "or", + "die", "der", "das", "dem", "den", "des", "ein", "und", + "auf", "aus", "bei", "bis", "für", "mit", "nur", "von", + # English + "a", "i", "an", "as", "at", "be", "by", "do", "go", "he", + "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok", + "on", "or", "so", "to", "up", "us", "we", + "the", "and", "but", "for", "not", + } + _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$') + artifact_cells_removed = 0 + for z in zones_data: + before = len(z.get("cells", [])) + kept = [] + for cell in z.get("cells", []): + text = (cell.get("text") or "").strip() + core = text.rstrip(".,;:!?'\"") + is_artifact = False + if not core: + is_artifact = True + elif _PURE_JUNK_RE.match(core): + is_artifact = True + elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS: + is_artifact = True + elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS: + is_artifact = True + elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core): + # Mixed digits + letters in short text (e.g. "7 EN", "a=3") + is_artifact = True + if is_artifact: + kept.append(None) # placeholder + else: + kept.append(cell) + z["cells"] = [c for c in kept if c is not None] + artifact_cells_removed += before - len(z["cells"]) + if artifact_cells_removed: + # Also remove rows that became completely empty + for z in zones_data: + cell_ris = {c.get("row_index") for c in z.get("cells", [])} + z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris] + logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed) + # 5j. Normalise word_box order to reading order (group by Y, sort by X). # The frontend renders colored cells from word_boxes array order # (GridTable.tsx), so they MUST be in left-to-right reading order.