diff --git a/backend-compliance/compliance/api/cra_assess_routes.py b/backend-compliance/compliance/api/cra_assess_routes.py index 23768e44..179acd45 100644 --- a/backend-compliance/compliance/api/cra_assess_routes.py +++ b/backend-compliance/compliance/api/cra_assess_routes.py @@ -13,6 +13,8 @@ from typing import Dict, List, Optional from fastapi import APIRouter, Depends, HTTPException from pydantic import BaseModel +from sqlalchemy import text +from starlette.concurrency import run_in_threadpool from compliance.services.cra_finding_mapper import assess_findings_payload from compliance.services.cra_applicability import ( @@ -237,6 +239,10 @@ def _machinery_obligations(limit_per: int = 4) -> list: out = [] db = SessionLocal() try: + # Bound the query: on a slow/unindexed prod DB this used to hang ~30s and + # block the worker. Cap at 4s → on timeout the queries raise, we degrade + # to "no machinery obligations" (best-effort enrichment, not core). + db.execute(text("SET statement_timeout = '4000'")) svc = UseCaseControlsService(db) for sub_topic, bucket in _MACHINERY_SUBTOPICS: try: @@ -291,7 +297,7 @@ async def readiness(body: ReadinessRequest): # — keep them in their OWN section, not mixed into the Code/Process/Document # cyber buckets (machine safety != cybersecurity). if body.is_machinery or machine_integrator: - machinery = _machinery_obligations() + machinery = await run_in_threadpool(_machinery_obligations) if machinery: regulations.append("Maschinen-VO 2023/1230") machinery_guideline = [item for _bucket, item in machinery] diff --git a/backend-compliance/compliance/services/cra_datasheet_extractor.py b/backend-compliance/compliance/services/cra_datasheet_extractor.py index a136d8ec..ed8517b0 100644 --- a/backend-compliance/compliance/services/cra_datasheet_extractor.py +++ b/backend-compliance/compliance/services/cra_datasheet_extractor.py @@ -91,11 +91,33 @@ def _system_prompt() -> str: ) +def _coerce_json(raw: str): + """Tolerant JSON load: handle ```json fences / surrounding prose (some hosted + models, e.g. OVH, ignore response_format and wrap the object).""" + s = (raw or "").strip() + try: + return json.loads(s) + except (json.JSONDecodeError, TypeError): + pass + if "```" in s: + parts = s.split("```") + if len(parts) > 1: + s = parts[1].lstrip() + if s[:4].lower() == "json": + s = s[4:] + i, j = s.find("{"), s.rfind("}") + if i != -1 and j > i: + try: + return json.loads(s[i:j + 1]) + except (json.JSONDecodeError, TypeError): + return None + return None + + def parse_grenzen_json(raw: str) -> dict: """Parse the LLM response into {key: {value, source}} for known keys only.""" - try: - data = json.loads(raw) - except (json.JSONDecodeError, TypeError): + data = _coerce_json(raw) + if not isinstance(data, dict): return {} fields = data.get("fields") if isinstance(data, dict) else None if not isinstance(fields, dict): diff --git a/backend-compliance/tests/test_cra_datasheet_extractor.py b/backend-compliance/tests/test_cra_datasheet_extractor.py index b1c9d1e6..8388f4e7 100644 --- a/backend-compliance/tests/test_cra_datasheet_extractor.py +++ b/backend-compliance/tests/test_cra_datasheet_extractor.py @@ -48,6 +48,14 @@ class TestParse: assert parse_grenzen_json("not json") == {} assert parse_grenzen_json("") == {} + def test_fenced_json(self): + raw = '```json\n{"fields": {"manufacturer": {"value": "OWIS", "source": "x"}}}\n```' + assert parse_grenzen_json(raw)["manufacturer"]["value"] == "OWIS" + + def test_prose_wrapped_json(self): + raw = 'Hier das Ergebnis:\n{"fields": {"machine_type": {"value": "Steuerung"}}}\nDanke.' + assert parse_grenzen_json(raw)["machine_type"]["value"] == "Steuerung" + class TestFollowups: def test_empty_limits_asks_all_essentials(self):