From 400011050114dca3630c18fe51a0f17d778b6145 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 21 Mar 2026 18:05:31 +0100 Subject: [PATCH] fix: extend tiny symbol filter to all non-black colors, raise area to 200 Step 5i rule (a) only caught blue tiny symbols. Graphic fragments from page illustrations (e.g. orange quote mark from man illustration) were missed. Now filters any non-black colored word_box with area < 200 and confidence < 85. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 3523964..9787053 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2292,7 +2292,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # OCR reads these as text artifacts (©, e, *, or even plausible words # like "fighily" overlapping the real word "tightly"). # Detection rules: - # a) Tiny blue symbols: area < 150 AND conf < 85 + # a) Tiny coloured symbols: area < 200 AND conf < 85 (any non-black) # b) Overlapping word_boxes: >40% x-overlap → remove lower confidence # c) Duplicate text: consecutive blue wbs with identical text, gap < 6px bullet_removed = 0 @@ -2303,10 +2303,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: continue to_remove: set = set() - # Rule (a): tiny blue symbols + # Rule (a): tiny coloured symbols (bullets, graphic fragments) for i, wb in enumerate(wbs): - if (wb.get("color_name") == "blue" - and wb.get("width", 0) * wb.get("height", 0) < 150 + cn = wb.get("color_name", "black") + if (cn != "black" + and wb.get("width", 0) * wb.get("height", 0) < 200 and wb.get("conf", 100) < 85): to_remove.add(i)