"""Assertion Engine — splits text into sentences and classifies each. Each sentence is tagged as: - assertion: normative statement (pflicht / empfehlung / kann) - fact: references concrete evidence artifacts - rationale: explains why something is required """ import re from typing import Optional from .normative_patterns import ( PFLICHT_RE, EMPFEHLUNG_RE, KANN_RE, RATIONALE_RE, EVIDENCE_RE, ) # Sentence splitter: period/excl/question followed by space+uppercase, or newlines _SENTENCE_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ])|(?:\n\s*\n)') def extract_assertions( text: str, entity_type: str, entity_id: str, tenant_id: Optional[str] = None, ) -> list[dict]: """Split *text* into sentences and classify each one. Returns a list of dicts ready for AssertionDB creation. """ if not text or not text.strip(): return [] sentences = _SENTENCE_SPLIT.split(text.strip()) results: list[dict] = [] for idx, raw in enumerate(sentences): sentence = raw.strip() if not sentence or len(sentence) < 5: continue assertion_type, normative_tier = _classify_sentence(sentence) results.append({ "tenant_id": tenant_id, "entity_type": entity_type, "entity_id": entity_id, "sentence_text": sentence, "sentence_index": idx, "assertion_type": assertion_type, "normative_tier": normative_tier, "evidence_ids": [], "confidence": 0.0, }) return results def _classify_sentence(sentence: str) -> tuple[str, Optional[str]]: """Return (assertion_type, normative_tier) for a single sentence.""" # 1. Check for evidence/fact keywords first if EVIDENCE_RE.search(sentence): return ("fact", None) # 2. Check for rationale normative_count = len(PFLICHT_RE.findall(sentence)) + len(EMPFEHLUNG_RE.findall(sentence)) + len(KANN_RE.findall(sentence)) rationale_count = len(RATIONALE_RE.findall(sentence)) if rationale_count > 0 and rationale_count >= normative_count: return ("rationale", None) # 3. Normative classification if PFLICHT_RE.search(sentence): return ("assertion", "pflicht") if EMPFEHLUNG_RE.search(sentence): return ("assertion", "empfehlung") if KANN_RE.search(sentence): return ("assertion", "kann") # 4. Default: unclassified assertion return ("assertion", None)