# breakpilot-compliance/backend-compliance/compliance/services/assertion_engine.py

"""Assertion Engine — splits text into sentences and classifies each.

Each sentence is tagged as:
- assertion: normative statement (pflicht / empfehlung / kann)
- fact: references concrete evidence artifacts
- rationale: explains why something is required
"""

import re
from typing import Optional

from .normative_patterns import (
    PFLICHT_RE, EMPFEHLUNG_RE, KANN_RE, RATIONALE_RE, EVIDENCE_RE,
)

# Sentence splitter. Two alternative split points:
#   1. a run of whitespace that directly follows '.', '!' or '?' and is
#      directly followed by an uppercase letter (A-Z or Ä/Ö/Ü); the
#      punctuation stays attached to the preceding sentence because it is
#      only matched by a lookbehind;
#   2. a blank line, i.e. a newline, optional whitespace, and another newline.
# The pattern has no capturing groups, so ``split()`` returns only the text
# pieces and never the separators.
_SENTENCE_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ])|(?:\n\s*\n)')


def extract_assertions(
    text: str,
    entity_type: str,
    entity_id: str,
    tenant_id: Optional[str] = None,
) -> list[dict]:
    """Break *text* into sentences and classify every usable one.

    Args:
        text: Free text to analyse. Falsy or whitespace-only input yields
            an empty list.
        entity_type: Copied unchanged into every result record.
        entity_id: Copied unchanged into every result record.
        tenant_id: Optional tenant identifier, copied unchanged into every
            result record.

    Returns:
        One dict per retained sentence (ready for AssertionDB creation).
        ``sentence_index`` is the position of the fragment in the raw split
        result, so fragments that were skipped leave gaps in the numbering.
        ``evidence_ids`` always starts empty and ``confidence`` at ``0.0``.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return []

    records: list[dict] = []
    for position, fragment in enumerate(_SENTENCE_SPLIT.split(stripped)):
        candidate = fragment.strip()
        # Fragments shorter than five characters (including empty ones) are
        # dropped as noise; they still consume an index.
        if len(candidate) < 5:
            continue

        kind, tier = _classify_sentence(candidate)
        record = {
            "tenant_id": tenant_id,
            "entity_type": entity_type,
            "entity_id": entity_id,
            "sentence_text": candidate,
            "sentence_index": position,
            "assertion_type": kind,
            "normative_tier": tier,
            "evidence_ids": [],
            "confidence": 0.0,
        }
        records.append(record)

    return records


def _classify_sentence(sentence: str) -> tuple[str, Optional[str]]:
    """Return (assertion_type, normative_tier) for a single sentence.

    The rules are evaluated in order and the first one that applies wins:

    1. Any ``EVIDENCE_RE`` match            -> ``("fact", None)``
    2. At least one ``RATIONALE_RE`` match, and at least as many rationale
       matches as combined PFLICHT/EMPFEHLUNG/KANN matches
                                            -> ``("rationale", None)``
    3. Normative tier, in the precedence pflicht > empfehlung > kann
                                            -> ``("assertion", <tier>)``
    4. Nothing matched                      -> ``("assertion", None)``

    Args:
        sentence: A single, already stripped sentence.

    Returns:
        A tuple of the assertion type (``"fact"``, ``"rationale"`` or
        ``"assertion"``) and the normative tier (``"pflicht"``,
        ``"empfehlung"``, ``"kann"`` or ``None``).
    """
    # 1. Evidence/fact keywords take priority over everything else.
    if EVIDENCE_RE.search(sentence):
        return ("fact", None)

    # Count each normative pattern exactly once. The counts feed both the
    # rationale comparison and the tier selection, so no pattern has to be
    # scanned a second time. Tuple order encodes tier precedence.
    tier_hits = (
        ("pflicht", len(PFLICHT_RE.findall(sentence))),
        ("empfehlung", len(EMPFEHLUNG_RE.findall(sentence))),
        ("kann", len(KANN_RE.findall(sentence))),
    )
    normative_total = sum(hits for _, hits in tier_hits)

    # 2. Rationale wins only when its markers are at least as frequent as
    #    the normative markers in the same sentence.
    rationale_hits = len(RATIONALE_RE.findall(sentence))
    if rationale_hits > 0 and rationale_hits >= normative_total:
        return ("rationale", None)

    # 3. Strongest matching normative tier.
    for tier, hits in tier_hits:
        if hits:
            return ("assertion", tier)

    # 4. Default: unclassified assertion.
    return ("assertion", None)