fix: handle Qwen think mode in classification, add German term matching
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -121,18 +121,35 @@ async def _classify(client: httpx.AsyncClient, text: str) -> str:
|
||||
resp = await client.post(f"{SDK_URL}/sdk/v1/llm/chat", headers=SDK_HEADERS, json={
|
||||
"messages": [
|
||||
{"role": "system", "content": (
|
||||
"/no_think\n"
|
||||
"Klassifiziere das Dokument in GENAU EINE Kategorie: "
|
||||
"privacy_policy, cookie_banner, terms_of_service, imprint, dpa, other. "
|
||||
"Antworte NUR mit dem Kategorienamen, nichts anderes."
|
||||
"Antworte NUR mit dem Kategorienamen, nichts anderes. Kein Denken, keine Erklaerung."
|
||||
)},
|
||||
{"role": "user", "content": text[:2000]},
|
||||
],
|
||||
})
|
||||
data = resp.json()
|
||||
raw = data.get("response", data.get("content", "other")).strip().lower()
|
||||
# Qwen 3.5 may use think mode — the answer can arrive in response, content, or message.content (checked in that order)
|
||||
raw = (
|
||||
data.get("response", "")
|
||||
or data.get("content", "")
|
||||
or (data.get("message", {}) or {}).get("content", "")
|
||||
or ""
|
||||
).strip().lower()
|
||||
# Strip Qwen think tags if present
|
||||
raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
|
||||
logger.info("Classification raw response: %s", raw[:200])
|
||||
for cat in ["privacy_policy", "cookie_banner", "terms_of_service", "imprint", "dpa"]:
|
||||
if cat in raw:
|
||||
return cat
|
||||
# Also check German terms
|
||||
if "datenschutz" in raw:
|
||||
return "privacy_policy"
|
||||
if "cookie" in raw:
|
||||
return "cookie_banner"
|
||||
if "impressum" in raw:
|
||||
return "imprint"
|
||||
return "other"
|
||||
except Exception as e:
|
||||
logger.warning("Classification failed: %s", e)
|
||||
|
||||
Reference in New Issue
Block a user