diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 7c7a53e..90b1caa 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -5562,6 +5562,10 @@ async def llm_review_entries(
for e in review_entries
]
+ logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
+ len(review_entries), len(entries), model, len(entries) - len(reviewable))
+ logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))
+
prompt = _build_llm_prompt(table_lines)
t0 = time.time()
@@ -5572,6 +5576,7 @@ async def llm_review_entries(
"model": model,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
+ "think": False, # qwen3: disable chain-of-thought (Ollama >=0.6)
"options": {"temperature": 0.1, "num_predict": 8192},
},
)
@@ -5579,7 +5584,11 @@ async def llm_review_entries(
content = resp.json().get("message", {}).get("content", "")
duration_ms = int((time.time() - t0) * 1000)
+ logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
+ logger.debug("LLM review raw response (first 500): %.500s", content)
+
corrected = _parse_llm_json_array(content)
+ logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
changes, corrected_entries = _diff_batch(review_entries, corrected)
# Merge corrected entries back into the full list
@@ -5696,15 +5705,19 @@ async def llm_review_entries_streaming(
def _parse_llm_json_array(text: str) -> List[Dict]:
- """Extract JSON array from LLM response (may contain markdown fences)."""
+ """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
+ # Strip qwen3 <think>...</think> blocks (present even with think=False on some builds)
+ text = _re.sub(r'<think>.*?</think>', '', text, flags=_re.DOTALL)
# Strip markdown code fences
text = _re.sub(r'```json\s*', '', text)
text = _re.sub(r'```\s*', '', text)
- # Find array
+ # Find first [ ... last ] (non-greedy would miss nested structures, greedy is correct here)
match = _re.search(r'\[.*\]', text, _re.DOTALL)
if match:
try:
return _json.loads(match.group())
- except (ValueError, _json.JSONDecodeError):
- pass
+ except (ValueError, _json.JSONDecodeError) as e:
+ logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
+ else:
+ logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
return []