feat(cra): Datenblatt→Grenzen-Extraktor (hybrid, lokales 35B)

Hybrid-Extraktion Datenblatt → IACE Grenzen (ISO 12100): deterministischer
Detektor (Schnittstellen/Einheiten per Regex) + lokales 35B via llm_cascade
(Qwen-lokal-first) für die semantische Zuordnung auf die echten LimitsFormData-
Keys. Nichts erfinden: Feld nicht im Text → leer + Quellen-Zitat je Feld.
Essenzielle ISO-12100-Felder, die leer bleiben → gezielte Rückfragen
(foreseeable_misuses, person_groups, qualification, temporal_limits …).
Endpoint POST /api/v1/cra/extract-datasheet. 13 Tests grün (reine Teile).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-16 19:06:07 +02:00
parent 62fafaaec5
commit cfdc5fe277
3 changed files with 268 additions and 0 deletions
@@ -0,0 +1,78 @@
"""Datasheet -> Grenzen extraction (deterministic + parser parts)."""
from compliance.services.cra_datasheet_extractor import (
detect_signals, parse_grenzen_json, compute_followups, _merge_detected, _ESSENTIAL,
)
# Sample datasheet excerpt shared by the detector tests. It names several
# interfaces (Ethernet, USB, RS232, Anybus, Modbus/TCP) and a "24 V" supply,
# which the assertions in TestDetectSignals look for.
OWIS = (
    "PS 90+ Universelle Positioniersteuerung, bis 9 Achsen. "
    "Schnittstellen: Ethernet, USB, RS232, optional Anybus (Modbus/TCP). "
    "Versorgung 24 V. "
    "SDK fuer C/C++/C#/LabView."
)
class TestDetectSignals:
    """Checks on ``detect_signals``: the ``interfaces`` and ``units`` lists."""

    def test_interfaces_detected(self):
        """Each interface token named in the sample text must be reported."""
        found = detect_signals(OWIS)["interfaces"]
        expected = ("Ethernet", "USB", "RS232", "Modbus", "Anybus")
        for name in expected:
            # The token doubles as the failure message to show which one is missing.
            assert name in found, name

    def test_units_detected(self):
        """At least one reported unit must contain both "24" and "V"."""
        units = detect_signals(OWIS)["units"]
        has_24_volt = any("24" in unit and "V" in unit for unit in units)
        assert has_24_volt

    def test_empty_text(self):
        """An empty input yields empty ``interfaces`` and ``units`` lists."""
        signals = detect_signals("")
        assert signals["interfaces"] == []
        assert signals["units"] == []

    def test_no_duplicate_rs232_variants(self):
        """"RS232" and "RS-232" in one text must yield a single RS* entry."""
        found = detect_signals("RS232 and RS-232 ports")["interfaces"]
        rs_entries = [name for name in found if name.lower().startswith("rs")]
        assert len(rs_entries) == 1
class TestParse:
    """Checks on ``parse_grenzen_json``: tolerant parsing of the raw model output."""

    def test_parses_fields_wrapper(self):
        """Entries under "fields" are returned; entries with an empty value are dropped."""
        payload = (
            '{"fields": {'
            '"machine_designation": {"value": "PS 90+", "source": "PS 90+ ..."}, '
            '"intended_purpose": {"value": "", "source": ""}'
            '}}'
        )
        parsed = parse_grenzen_json(payload)
        assert parsed["machine_designation"]["value"] == "PS 90+"
        # An empty value must not surface as a key at all.
        assert "intended_purpose" not in parsed

    def test_unknown_keys_ignored(self):
        """A key the parser does not recognise produces no output entry."""
        parsed = parse_grenzen_json('{"fields": {"nonsense": {"value": "x"}}}')
        assert parsed == {}

    def test_string_entry_tolerated(self):
        """A bare string in place of a {"value": ...} object is accepted as the value."""
        parsed = parse_grenzen_json('{"fields": {"manufacturer": "OWIS"}}')
        assert parsed["manufacturer"]["value"] == "OWIS"

    def test_bad_json(self):
        """Unparseable and empty input both yield an empty dict."""
        for raw in ("not json", ""):
            assert parse_grenzen_json(raw) == {}
class TestFollowups:
    """Checks on ``compute_followups``: which essential fields trigger a question."""

    def test_empty_limits_asks_all_essentials(self):
        """With no limits filled, the asked keys equal ``_ESSENTIAL`` and each has a question."""
        followups = compute_followups({})
        asked_keys = {item["key"] for item in followups}
        assert asked_keys == _ESSENTIAL
        assert all(item["question"] for item in followups)

    def test_filled_essential_not_asked(self):
        """A field that already has a value produces no follow-up."""
        followups = compute_followups({"intended_purpose": "Positionieren"})
        asked_keys = {item["key"] for item in followups}
        assert "intended_purpose" not in asked_keys

    def test_blank_string_still_asked(self):
        """A whitespace-only value is treated as unfilled and is still asked about."""
        followups = compute_followups({"intended_purpose": " "})
        asked_keys = {item["key"] for item in followups}
        assert "intended_purpose" in asked_keys
class TestMergeDetected:
    """Checks on ``_merge_detected``: back-filling limits from detected signals."""

    def test_backfills_electrical_interfaces_excluding_usb(self):
        """Detected interfaces fill ``electrical_interfaces``, USB excluded, with provenance."""
        limits = {}
        prov = {}
        signals = {"interfaces": ["Ethernet", "Modbus", "USB"]}
        _merge_detected(limits, prov, signals)
        backfilled = limits["electrical_interfaces"]
        assert "Ethernet" in backfilled
        assert "USB" not in backfilled
        # The provenance entry marks the value as coming from the deterministic path.
        assert prov["electrical_interfaces"].startswith("deterministisch")

    def test_does_not_overwrite_llm_value(self):
        """A value already present in ``limits`` is left untouched."""
        llm_value = "PROFINET (vom LLM)"
        limits = {"electrical_interfaces": llm_value}
        _merge_detected(limits, {}, {"interfaces": ["Ethernet"]})
        assert limits["electrical_interfaces"] == "PROFINET (vom LLM)"